Project Kaggle - Marketing Analytics
In [14]:
import kagglehub
import os
from skimpy import skim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
import seaborn as sns
from IPython.display import Image, display
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')
In [66]:
from MC_Functions import *
def Column_distribution(DF):
    """
    Display the distribution (histogram + KDE) and a boxplot for every numeric
    column of a DataFrame, then report the number of IQR outliers per column.

    Arguments:
        DF: The pandas DataFrame containing the data. Non-numeric columns
            are silently ignored.
    """
    # Keep only numeric columns — histograms and quantiles are undefined otherwise.
    DF = DF.select_dtypes(include=[np.number])
    for column in DF.columns:  # the enumerate() index was never used; plain iteration is clearer
        # One small figure per column: histogram on the left, boxplot on the right.
        plt.figure(figsize=(10, 2.5))
        plt.subplot(1, 2, 1)
        # Graphical visualization of the distribution
        sns.histplot(DF[column], bins=30, kde=True, color='teal', edgecolor='black', linewidth=1, line_kws={'color': 'red'})
        plt.title(f'Distribution: {column}', fontsize=12, color='firebrick', fontweight='bold')
        # Box plot to detect outliers
        plt.subplot(1, 2, 2)
        sns.boxplot(x=DF[column], color='teal',
                    notch=True, whiskerprops={'linewidth': 1}, showmeans=True,
                    meanprops=dict(marker='p', markerfacecolor='white', markeredgecolor='black', markersize=8),
                    medianprops=dict(linestyle='-', linewidth=2, color='firebrick'),
                    flierprops=dict(marker='p', markerfacecolor='firebrick', markersize=5, markeredgecolor='black'),
                    boxprops=dict(linestyle='-', linewidth=1.5),
                    showfliers=True, width=0.4)
        plt.title(f'Boxplot: {column}', fontsize=12, color='firebrick', fontweight='bold')
        # Median & mean value display on the boxplot:
        median_value = DF[column].median()  # pandas equivalent of np.median on a Series
        moy_value = DF[column].mean()
        plt.axvline(x=median_value, ymin=0.5, ymax=0, color='firebrick', linestyle=':', linewidth=1)
        plt.axvline(x=moy_value, ymin=0.5, ymax=1, color='blue', linestyle=':', linewidth=1)
        plt.text(moy_value, -0.3, f'Mean: {moy_value:.2f}', color='blue', fontsize=12, ha='left')
        plt.text(median_value, 0.4, f'Median: {median_value:.2f}', color='firebrick', fontsize=12, ha='right')
        plt.tight_layout()
        plt.show()
        plt.close()  # release the figure: with many columns, open figures pile up in memory
        # IQR calculation (Tukey fences at 1.5 * IQR)
        Q1 = DF[column].quantile(0.25)
        Q3 = DF[column].quantile(0.75)
        IQR = Q3 - Q1
        # Determination of thresholds for outliers
        seuil_inf = Q1 - 1.5 * IQR  # lower fence
        seuil_sup = Q3 + 1.5 * IQR  # upper fence
        # Outlier identification: anything strictly outside the fences
        outliers = DF[(DF[column] < seuil_inf) | (DF[column] > seuil_sup)]
        # Outlier count display
        Tmess('{} : {} Outliers'.format(column, outliers.shape[0]), Color='black')
        Tmess('---------------------')
📊 About the Dataset¶
🌐 Context¶
This dataset is publicly available on GitHub and can be used for:
- 🔍 Exploratory Data Analysis (EDA)
- 📈 Statistical Analysis
- 📊 Data Visualizations
📁 Content¶
The dataset, ifood_df.csv, contains information on 2,205 customers from the XYZ company. It provides valuable insights into key areas such as:
- 🧑💼 Customer Profiles — Demographic and personal information about customers.
- 🛒 Product Preferences — Details on customer preferences for specific products.
- 🚀 Campaign Successes & Failures — Data on the performance of marketing campaigns.
- 🌐 Channel Performance — Analysis of customer interactions across different channels.
In [69]:
# Data-dictionary image: documents the meaning of each dataset column
display(Image(filename='dictionary.png'))
In [73]:
# Reading the csv file
data = pd.read_csv('ifood_df.csv')
# Work on a copy so the raw frame stays untouched
df=data.copy()
# Display
display(df.head(2))
Tmess('Dataframe dimensions: {} rows and {} columns'.format(df.shape[0],df.shape[1]), Color='blue', Size=12)
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | ... | marital_Together | marital_Widow | education_2n Cycle | education_Basic | education_Graduation | education_Master | education_PhD | MntTotal | MntRegularProds | AcceptedCmpOverall | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58138.0 | 0 | 0 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1529 | 1441 | 0 |
| 1 | 46344.0 | 1 | 1 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | 15 | 0 |
2 rows × 39 columns
Dataframe dimensions: 2205 rows and 39 columns
In [74]:
# Show every column in wide outputs (the frame has 39 columns)
pd.set_option('display.max_columns', None)
df.describe()
Out[74]:
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Age | Customer_Days | marital_Divorced | marital_Married | marital_Single | marital_Together | marital_Widow | education_2n Cycle | education_Basic | education_Graduation | education_Master | education_PhD | MntTotal | MntRegularProds | AcceptedCmpOverall | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.0 | 2205.0 | 2205.00000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.00000 |
| mean | 51622.094785 | 0.442177 | 0.506576 | 49.009070 | 306.164626 | 26.403175 | 165.312018 | 37.756463 | 27.128345 | 44.057143 | 2.318367 | 4.100680 | 2.645351 | 5.823583 | 5.336961 | 0.073923 | 0.074376 | 0.073016 | 0.064399 | 0.013605 | 0.009070 | 3.0 | 11.0 | 0.15102 | 51.095692 | 2512.718367 | 0.104308 | 0.387302 | 0.216327 | 0.257596 | 0.034467 | 0.089796 | 0.024490 | 0.504762 | 0.165079 | 0.215873 | 562.764626 | 518.707483 | 0.29932 |
| std | 20713.063826 | 0.537132 | 0.544380 | 28.932111 | 337.493839 | 39.784484 | 217.784507 | 54.824635 | 41.130468 | 51.736211 | 1.886107 | 2.737424 | 2.798647 | 3.241796 | 2.413535 | 0.261705 | 0.262442 | 0.260222 | 0.245518 | 0.115872 | 0.094827 | 0.0 | 0.0 | 0.35815 | 11.705801 | 202.563647 | 0.305730 | 0.487244 | 0.411833 | 0.437410 | 0.182467 | 0.285954 | 0.154599 | 0.500091 | 0.371336 | 0.411520 | 575.936911 | 553.847248 | 0.68044 |
| min | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.00000 | 24.000000 | 2159.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | -283.000000 | 0.00000 |
| 25% | 35196.000000 | 0.000000 | 0.000000 | 24.000000 | 24.000000 | 2.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | 1.000000 | 2.000000 | 0.000000 | 3.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.00000 | 43.000000 | 2339.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 56.000000 | 42.000000 | 0.00000 |
| 50% | 51287.000000 | 0.000000 | 0.000000 | 49.000000 | 178.000000 | 8.000000 | 68.000000 | 12.000000 | 8.000000 | 25.000000 | 2.000000 | 4.000000 | 2.000000 | 5.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.00000 | 50.000000 | 2515.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 343.000000 | 288.000000 | 0.00000 |
| 75% | 68281.000000 | 1.000000 | 1.000000 | 74.000000 | 507.000000 | 33.000000 | 232.000000 | 50.000000 | 34.000000 | 56.000000 | 3.000000 | 6.000000 | 4.000000 | 8.000000 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.00000 | 61.000000 | 2688.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 964.000000 | 884.000000 | 0.00000 |
| max | 113734.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 262.000000 | 321.000000 | 15.000000 | 27.000000 | 28.000000 | 13.000000 | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.0 | 11.0 | 1.00000 | 80.000000 | 2858.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2491.000000 | 2458.000000 | 4.00000 |
In [76]:
# Quick overview of dtypes, missing values and distributions via skimpy
skim(df)
╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮ │ Data Summary Data Types │ │ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │ │ ┃ dataframe ┃ Values ┃ ┃ Column Type ┃ Count ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │ │ │ Number of rows │ 2205 │ │ int32 │ 38 │ │ │ │ Number of columns │ 39 │ │ float64 │ 1 │ │ │ └───────────────────┴────────┘ └─────────────┴───────┘ │ │ number │ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │ │ ┃ column_name ┃ NA ┃ NA % ┃ mean ┃ sd ┃ p0 ┃ p25 ┃ p50 ┃ p75 ┃ p100 ┃ hist ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │ │ │ Income │ 0 │ 0 │ 51620 │ 20710 │ 1730 │ 35200 │ 51290 │ 68280 │ 113700 │ ▂▇▇▇▃ │ │ │ │ Kidhome │ 0 │ 0 │ 0.4422 │ 0.5371 │ 0 │ 0 │ 0 │ 1 │ 2 │ ▇ ▆ │ │ │ │ Teenhome │ 0 │ 0 │ 0.5066 │ 0.5444 │ 0 │ 0 │ 0 │ 1 │ 2 │ ▇ ▇ │ │ │ │ Recency │ 0 │ 0 │ 49.01 │ 28.93 │ 0 │ 24 │ 49 │ 74 │ 99 │ ▇▇▇▇▇▇ │ │ │ │ MntWines │ 0 │ 0 │ 306.2 │ 337.5 │ 0 │ 24 │ 178 │ 507 │ 1493 │ ▇▂▂▁▁ │ │ │ │ MntFruits │ 0 │ 0 │ 26.4 │ 39.78 │ 0 │ 2 │ 8 │ 33 │ 199 │ ▇▁▁ │ │ │ │ MntMeatProducts │ 0 │ 0 │ 165.3 │ 217.8 │ 0 │ 16 │ 68 │ 232 │ 1725 │ ▇▁▁ │ │ │ │ MntFishProducts │ 0 │ 0 │ 37.76 │ 54.82 │ 0 │ 3 │ 12 │ 50 │ 259 │ ▇▁▁ │ │ │ │ MntSweetProducts │ 0 │ 0 │ 27.13 │ 41.13 │ 0 │ 1 │ 8 │ 34 │ 262 │ ▇▁▁ │ │ │ │ MntGoldProds │ 0 │ 0 │ 44.06 │ 51.74 │ 0 │ 9 │ 25 │ 56 │ 321 │ ▇▂▁ │ │ │ │ NumDealsPurchases │ 0 │ 0 │ 2.318 │ 1.886 │ 0 │ 1 │ 2 │ 3 │ 15 │ ▇▃▁ │ │ │ │ NumWebPurchases │ 0 │ 0 │ 4.101 │ 2.737 │ 0 │ 2 │ 4 │ 6 │ 27 │ ▇▃▁ │ │ │ │ NumCatalogPurchases │ 0 │ 0 │ 2.645 │ 2.799 │ 0 │ 0 │ 2 │ 4 │ 28 │ ▇▂ │ │ │ │ NumStorePurchases │ 0 │ 0 │ 5.824 │ 3.242 │ 0 │ 3 │ 5 │ 8 │ 13 │ ▂▇▃▃▂▃ │ │ │ │ NumWebVisitsMonth │ 0 │ 0 │ 5.337 │ 2.414 │ 0 │ 3 │ 6 │ 7 │ 20 │ ▅▇▇ │ │ │ │ AcceptedCmp3 │ 0 │ 0 │ 0.07392 │ 0.2617 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ 
│ │ │ │ AcceptedCmp4 │ 0 │ 0 │ 0.07438 │ 0.2624 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ AcceptedCmp5 │ 0 │ 0 │ 0.07302 │ 0.2602 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ AcceptedCmp1 │ 0 │ 0 │ 0.0644 │ 0.2455 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ AcceptedCmp2 │ 0 │ 0 │ 0.01361 │ 0.1159 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ │ │ │ │ Complain │ 0 │ 0 │ 0.00907 │ 0.09483 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ │ │ │ │ Z_CostContact │ 0 │ 0 │ 3 │ 0 │ 3 │ 3 │ 3 │ 3 │ 3 │ ▇ │ │ │ │ Z_Revenue │ 0 │ 0 │ 11 │ 0 │ 11 │ 11 │ 11 │ 11 │ 11 │ ▇ │ │ │ │ Response │ 0 │ 0 │ 0.151 │ 0.3581 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ Age │ 0 │ 0 │ 51.1 │ 11.71 │ 24 │ 43 │ 50 │ 61 │ 80 │ ▂▅▇▆▅▁ │ │ │ │ Customer_Days │ 0 │ 0 │ 2513 │ 202.6 │ 2159 │ 2339 │ 2515 │ 2688 │ 2858 │ ▇▇▇▇▇▇ │ │ │ │ marital_Divorced │ 0 │ 0 │ 0.1043 │ 0.3057 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ marital_Married │ 0 │ 0 │ 0.3873 │ 0.4872 │ 0 │ 0 │ 0 │ 1 │ 1 │ ▇ ▅ │ │ │ │ marital_Single │ 0 │ 0 │ 0.2163 │ 0.4118 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▂ │ │ │ │ marital_Together │ 0 │ 0 │ 0.2576 │ 0.4374 │ 0 │ 0 │ 0 │ 1 │ 1 │ ▇ ▃ │ │ │ │ marital_Widow │ 0 │ 0 │ 0.03447 │ 0.1825 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ │ │ │ │ education_2n Cycle │ 0 │ 0 │ 0.0898 │ 0.286 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▁ │ │ │ │ education_Basic │ 0 │ 0 │ 0.02449 │ 0.1546 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ │ │ │ │ education_Graduation │ 0 │ 0 │ 0.5048 │ 0.5001 │ 0 │ 0 │ 1 │ 1 │ 1 │ ▇ ▇ │ │ │ │ education_Master │ 0 │ 0 │ 0.1651 │ 0.3713 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▂ │ │ │ │ education_PhD │ 0 │ 0 │ 0.2159 │ 0.4115 │ 0 │ 0 │ 0 │ 0 │ 1 │ ▇ ▂ │ │ │ │ MntTotal │ 0 │ 0 │ 562.8 │ 575.9 │ 4 │ 56 │ 343 │ 964 │ 2491 │ ▇▂▂▁▁ │ │ │ │ MntRegularProds │ 0 │ 0 │ 518.7 │ 553.8 │ -283 │ 42 │ 288 │ 884 │ 2458 │ ▇▅▃▂▁ │ │ │ │ AcceptedCmpOverall │ 0 │ 0 │ 0.2993 │ 0.6804 │ 0 │ 0 │ 0 │ 0 │ 4 │ ▇▁ │ │ │ └────────────────────────┴─────┴───────┴──────────┴─────────┴──────┴───────┴───────┴───────┴────────┴────────┘ │ ╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯
👉 No missing value
👉 The dataframe only contains numerical variables.
👉 Ouch, no new customers for more than 2150 days! Let's fix it:
- The dataset is a bit outdated, so I will reduce 'Customer_Days' by 2150 days.
👉 Irrelevant variables : 'Z_CostContact', 'Z_Revenue'
In [79]:
# Update 'Customer_Days' (dataset is outdated — shift so the most recent customer is ~recent)
# NOTE(review): this cell is NOT idempotent — re-running it subtracts another 2150
# days. It is correct under a fresh "Restart & Run All", which runs it once.
df['Customer_Days'] = df['Customer_Days'] - 2150
print(df['Customer_Days'].describe())
Tmess("Great, last customer acquired only {} days ago 😉".format(df['Customer_Days'].min()), Color='blue', Size=12)
count 2205.000000 mean 362.718367 std 202.563647 min 9.000000 25% 189.000000 50% 365.000000 75% 538.000000 max 708.000000 Name: Customer_Days, dtype: float64
Great, last customer acquired only 9 days ago 😉
In [80]:
# Remove irrelevant (constant) variables if present.
# errors='ignore' replaces the manual membership filter: pandas skips missing
# labels itself, which also makes the cell idempotent on re-run.
cols_to_remove = ['Z_CostContact', 'Z_Revenue']
df = df.drop(columns=cols_to_remove, errors='ignore')
df.describe()
Out[80]:
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | Customer_Days | marital_Divorced | marital_Married | marital_Single | marital_Together | marital_Widow | education_2n Cycle | education_Basic | education_Graduation | education_Master | education_PhD | MntTotal | MntRegularProds | AcceptedCmpOverall | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.00000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.000000 | 2205.00000 |
| mean | 51622.094785 | 0.442177 | 0.506576 | 49.009070 | 306.164626 | 26.403175 | 165.312018 | 37.756463 | 27.128345 | 44.057143 | 2.318367 | 4.100680 | 2.645351 | 5.823583 | 5.336961 | 0.073923 | 0.074376 | 0.073016 | 0.064399 | 0.013605 | 0.009070 | 0.15102 | 51.095692 | 362.718367 | 0.104308 | 0.387302 | 0.216327 | 0.257596 | 0.034467 | 0.089796 | 0.024490 | 0.504762 | 0.165079 | 0.215873 | 562.764626 | 518.707483 | 0.29932 |
| std | 20713.063826 | 0.537132 | 0.544380 | 28.932111 | 337.493839 | 39.784484 | 217.784507 | 54.824635 | 41.130468 | 51.736211 | 1.886107 | 2.737424 | 2.798647 | 3.241796 | 2.413535 | 0.261705 | 0.262442 | 0.260222 | 0.245518 | 0.115872 | 0.094827 | 0.35815 | 11.705801 | 202.563647 | 0.305730 | 0.487244 | 0.411833 | 0.437410 | 0.182467 | 0.285954 | 0.154599 | 0.500091 | 0.371336 | 0.411520 | 575.936911 | 553.847248 | 0.68044 |
| min | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 24.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | -283.000000 | 0.00000 |
| 25% | 35196.000000 | 0.000000 | 0.000000 | 24.000000 | 24.000000 | 2.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | 1.000000 | 2.000000 | 0.000000 | 3.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 43.000000 | 189.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 56.000000 | 42.000000 | 0.00000 |
| 50% | 51287.000000 | 0.000000 | 0.000000 | 49.000000 | 178.000000 | 8.000000 | 68.000000 | 12.000000 | 8.000000 | 25.000000 | 2.000000 | 4.000000 | 2.000000 | 5.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 50.000000 | 365.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 343.000000 | 288.000000 | 0.00000 |
| 75% | 68281.000000 | 1.000000 | 1.000000 | 74.000000 | 507.000000 | 33.000000 | 232.000000 | 50.000000 | 34.000000 | 56.000000 | 3.000000 | 6.000000 | 4.000000 | 8.000000 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 61.000000 | 538.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 964.000000 | 884.000000 | 0.00000 |
| max | 113734.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 262.000000 | 321.000000 | 15.000000 | 27.000000 | 28.000000 | 13.000000 | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 80.000000 | 708.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2491.000000 | 2458.000000 | 4.00000 |
In [83]:
# -------------------- Cross-check of the spend totals supplied in the file
# Regular product categories (everything except gold)
regular_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts']
# Recomputed total spending: regular products + gold
df['Spending'] = df[regular_cols + ['MntGoldProds']].sum(axis=1)
# Recomputed spending on regular products only
df['Spending_Regular'] = df[regular_cols].sum(axis=1)
# Side-by-side view of the supplied totals vs the recomputed ones
df[regular_cols + ['MntGoldProds', 'MntTotal', 'MntRegularProds',
                   'Spending', 'Spending_Regular']].head(2)
Out[83]:
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | MntTotal | MntRegularProds | Spending | Spending_Regular | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 635 | 88 | 546 | 172 | 88 | 88 | 1529 | 1441 | 1617 | 1529 |
| 1 | 11 | 1 | 6 | 2 | 1 | 6 | 21 | 15 | 27 | 21 |
👉 First point to be confirmed with the company:
- Use of 'MntTotal' & 'MntRegularProds' data for the rest of the analysis?
👉 For this analysis, I will use the calculated variables 'Spending' and 'Spending_Regular'.
In [89]:
# Drop the supplied totals (they disagree with the recomputed sums) and promote
# the recomputed columns to the canonical names.
# Plain assignment instead of inplace=True: same effect, but chainable and free
# of the hidden-state pitfalls of in-place mutation.
df = (df.drop(columns=['MntTotal', 'MntRegularProds'])
        .rename(columns={'Spending': 'MntTotal', 'Spending_Regular': 'MntRegularProds'}))
In [92]:
# Inspect the one-hot encoded education columns
df.loc[:, df.columns.str.startswith('education')].head(2)
Out[92]:
| education_2n Cycle | education_Basic | education_Graduation | education_Master | education_PhD | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 0 | 0 | 1 | 0 | 0 |
In [94]:
# Create the 'Education_level' column: collapse the one-hot education flags
# into a single ordinal level (1 = Basic … 5 = PhD, 0 = no flag set).
# np.select evaluates the conditions in order, reproducing the if/elif priority
# of the original row-wise apply, but vectorized (no Python-level loop).
education_conditions = [
    df['education_Basic'] == 1,
    df['education_2n Cycle'] == 1,
    df['education_Graduation'] == 1,
    df['education_Master'] == 1,
    df['education_PhD'] == 1,
]
df['Education_level'] = np.select(education_conditions, [1, 2, 3, 4, 5], default=0)
# The one-hot columns are now redundant
df.drop(['education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD'], axis=1, inplace=True)
df.head(2)
Out[94]:
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | Customer_Days | marital_Divorced | marital_Married | marital_Single | marital_Together | marital_Widow | AcceptedCmpOverall | MntTotal | MntRegularProds | Education_level | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58138.0 | 0 | 0 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 63 | 672 | 0 | 0 | 1 | 0 | 0 | 0 | 1617 | 1529 | 3 |
| 1 | 46344.0 | 1 | 1 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66 | 122 | 0 | 0 | 1 | 0 | 0 | 0 | 27 | 21 | 3 |
In [97]:
# Inspect the one-hot marital columns together with the children counts
df.loc[:, df.columns.str.startswith('marital') | df.columns.str.startswith('Kid') | df.columns.str.startswith('Teen')].head(2)
Out[97]:
| Kidhome | Teenhome | marital_Divorced | marital_Married | marital_Single | marital_Together | marital_Widow | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
In [99]:
# Derive household composition from the one-hot marital columns.
# Flags are checked in a fixed order (first match wins, 0 if none is set),
# mirroring the original if/elif chains exactly.
_marital_flags = [
    # (one-hot column, status label, adults in household)
    ('marital_Divorced', 'Divorced', 1),
    ('marital_Married', 'Married', 2),
    ('marital_Single', 'Single', 1),
    ('marital_Together', 'Together', 2),
    ('marital_Widow', 'Widow', 1),
]

def _adults_in_household(row):
    """Number of adults implied by the first marital flag set (0 if none)."""
    for col, _label, adults in _marital_flags:
        if row[col] == 1:
            return adults
    return 0

def _marital_label(row):
    """Marital-status label from the first flag set (int 0 fallback kept for parity)."""
    for col, label, _adults in _marital_flags:
        if row[col] == 1:
            return label
    return 0

# Create the 'Adult_household' column with the number of adults
df['Adult_household'] = df.apply(_adults_in_household, axis=1)
df['Marital'] = df.apply(_marital_label, axis=1)
# Create the 'People_household' column: adults + children + teenagers
df['People_household'] = df['Adult_household'] + df['Kidhome'] + df['Teenhome']
# Removal of the now-redundant one-hot 'marital' variables
df.drop(['marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow'], axis=1, inplace=True)
# Display
df[[ 'Teenhome', 'Kidhome', 'Marital', 'Adult_household', 'People_household']].head(2)
Out[99]:
| Teenhome | Kidhome | Marital | Adult_household | People_household | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | Single | 1 | 1 |
| 1 | 1 | 1 | Single | 1 | 3 |
In [102]:
# Inspect the per-channel purchase counts
df.loc[:, df.columns.str.endswith('Purchases')].head(2)
Out[102]:
| NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | |
|---|---|---|---|---|
| 0 | 3 | 8 | 10 | 4 |
| 1 | 2 | 1 | 1 | 2 |
In [104]:
# Total number of purchases across the four sales channels
purchase_channels = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
df['Total_Purchases'] = df[purchase_channels].sum(axis=1)
df[purchase_channels + ['Total_Purchases']].head(2)
Out[104]:
| NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | Total_Purchases | |
|---|---|---|---|---|---|
| 0 | 3 | 8 | 10 | 4 | 25 |
| 1 | 2 | 1 | 1 | 2 | 6 |
In [107]:
# Customers with zero recorded purchases but non-zero spending — inconsistent
df[df['Total_Purchases'] == 0][['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                                'MntGoldProds', 'Total_Purchases', 'MntTotal']]
Out[107]:
| MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | Total_Purchases | MntTotal | |
|---|---|---|---|---|---|---|---|---|
| 961 | 2 | 1 | 1 | 1 | 0 | 1 | 0 | 6 |
| 1499 | 2 | 1 | 1 | 0 | 0 | 1 | 0 | 5 |
In [109]:
# Same two customers: every purchase channel is zero, yet MntTotal > 0
df[df['Total_Purchases'] == 0][['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'Total_Purchases', 'MntTotal']]
Out[109]:
| NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | Total_Purchases | MntTotal | |
|---|---|---|---|---|---|---|
| 961 | 0 | 0 | 0 | 0 | 0 | 6 |
| 1499 | 0 | 0 | 0 | 0 | 0 | 5 |
👉 A spending amount with zero purchases is inconsistent, so I delete these two rows
In [112]:
# Remove the two inconsistent rows (spending recorded without any purchase)
df = df[df['Total_Purchases'] != 0]
In [115]:
# Flag customers whose total spending exceeds half their income (implausible ratio)
df[(df['MntTotal'] / df['Income']) >0.5]
Out[115]:
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | Customer_Days | AcceptedCmpOverall | MntTotal | MntRegularProds | Education_level | Adult_household | Marital | People_household | Total_Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20 | 2447.0 | 1 | 0 | 42 | 1 | 1 | 1725 | 1 | 1 | 1 | 15 | 0 | 28 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | 548 | 0 | 1730 | 1729 | 3 | 2 | Married | 3 | 43 |
In [117]:
# Remove the implausible row(s): keep customers spending at most half their income.
# Fixed boundary: the original strict '< 0.5' would also drop rows whose ratio is
# exactly 0.5, although only ratios ABOVE 0.5 were flagged as inconsistent above.
df = df[(df['MntTotal'] / df['Income']) <= 0.5]
In [119]:
# Snapshot of the fully prepared dataframe for downstream analysis
df_Prepared = df.copy()
# Save df:
df_Prepared.to_csv('df_Prepared.csv', index=False)
In [128]:
# Distribution of non-binary (continuous / count) variables:
df_no_binary = df.drop(['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
                        'AcceptedCmp5', 'AcceptedCmpOverall', 'Complain', 'Response',
                        'Marital', 'People_household', 'Adult_household', 'Kidhome',
                        'Teenhome', 'Education_level'], axis=1)
# Visualization
Column_distribution(df_no_binary)
# Binary and ordinal variables (campaign flags, household composition, education):
df_binary = df[['AcceptedCmp1','AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmpOverall',
                'Complain', 'Response', 'Marital', 'People_household', 'Adult_household', 'Kidhome', 'Teenhome', 'Education_level']]
# Grid layout for the countplots:
cols_per_row = 4
num_cols = len(df_binary.columns)  # total number of columns to display
num_rows = -(-num_cols // cols_per_row)  # ceiling division: number of grid rows
# Creating the subplot grid (figure height scales with the number of rows)
fig, axes = plt.subplots(num_rows, cols_per_row, figsize=(20, num_rows * 3))
axes = axes.flatten()  # flatten the 2-D grid of axes into a 1-D sequence
# Visualization
Tmess("Distribution of binary and ordinal variables", Align='center', Color='Firebrick', Size='16', Weight='Bold')
for i, col in enumerate(df_binary.columns):
    sns.countplot(x=col, data=df, palette='viridis', ax=axes[i])
    axes[i].set_title(f'Distribution of {col}', size=16, color='firebrick')
    axes[i].set_xlabel('')
    axes[i].set_ylabel('Count')
# Delete the unused trailing axes (the grid may be larger than the column count)
for i in range(num_cols, len(axes)):
    fig.delaxes(axes[i])
plt.tight_layout()
plt.show()
Income : 0 Outliers
---------------------
Recency : 0 Outliers
---------------------
MntWines : 34 Outliers
---------------------
MntFruits : 245 Outliers
---------------------
MntMeatProducts : 169 Outliers
---------------------
MntFishProducts : 222 Outliers
---------------------
MntSweetProducts : 238 Outliers
---------------------
MntGoldProds : 201 Outliers
---------------------
NumDealsPurchases : 81 Outliers
---------------------
NumWebPurchases : 3 Outliers
---------------------
NumCatalogPurchases : 19 Outliers
---------------------
NumStorePurchases : 0 Outliers
---------------------
NumWebVisitsMonth : 6 Outliers
---------------------
Age : 0 Outliers
---------------------
Customer_Days : 0 Outliers
---------------------
MntTotal : 3 Outliers
---------------------
MntRegularProds : 3 Outliers
---------------------
Total_Purchases : 0 Outliers
---------------------
Distribution of binary and ordinal variables
In [131]:
# Convert the 'icefire' palette to a list of hexadecimal colors for plotly
icefire_palette = sns.color_palette('icefire', as_cmap=False).as_hex()
Tmess("Interactive Bubble Chart: Income vs MntTotal\nsize = 'People_household'", Color='firebrick', Align='center', Size=15, Weight='bold')
# Creating the interactive chart (bubble size = household size)
fig = px.scatter(
    df, x='Income', y='MntTotal', color='Marital', size='People_household',
    hover_data=[df.index, 'Income', 'MntTotal', 'Marital', 'People_household', 'Total_Purchases'],
    labels={'Income': 'Income', 'MntTotal': 'Total Spent'}, color_discrete_sequence=icefire_palette )
# Updated figure layout
fig.update_layout(
    # NOTE(review): title_font size=5 is very small — confirm this is intentional
    title_font=dict(size=5, color='firebrick', family='Arial'),
    width=1100, height=400,
    legend=dict(title='Marital', x=1, y=1),
    plot_bgcolor='white',
    xaxis=dict(tickfont=dict(color='black'), linecolor='black', gridcolor='gray'),
    yaxis=dict(tickfont=dict(color='black'), linecolor='black', gridcolor='gray' )
)
fig.show()
Tmess("With the regression lines", Color='firebrick', Align='center', Size=15, Weight='bold')
# One regression line per marital status
sns.lmplot(data=df, x='Income', y='MntTotal', hue='Marital', palette='icefire', height=4, aspect=2.4, legend=False)
# Visualization
plt.xlabel('Income')
plt.ylabel('Total Spent (MntTotal)')
plt.legend(title='Marital', bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
Interactive Bubble Chart: Income vs MntTotal
size = 'People_household'
With the regression lines
In [133]:
# Creation and preparation of the dataframe for the children analysis
df_child = df.copy()
# Total number of children (young kids + teenagers)
df_child['Children'] = df['Kidhome'] + df['Teenhome']
# Add 'SingleChildren' column: children count for single-adult households, 0 otherwise
df_child['SingleChildren'] = np.where(df_child['Adult_household'] == 1, df_child['Kidhome'] + df_child['Teenhome'], 0)
# Add 'CoupleChildren' column: children count for two-adult households, 0 otherwise
df_child['CoupleChildren'] = np.where(df_child['Adult_household'] == 2, df_child['Kidhome'] + df_child['Teenhome'], 0)
# Save df:
df_child.to_csv('df_child.csv', index=False)
df_child.head(2)
Out[133]:
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | Customer_Days | AcceptedCmpOverall | MntTotal | MntRegularProds | Education_level | Adult_household | Marital | People_household | Total_Purchases | Children | SingleChildren | CoupleChildren | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58138.0 | 0 | 0 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 63 | 672 | 0 | 1617 | 1529 | 3 | 1 | Single | 1 | 25 | 0 | 0 | 0 |
| 1 | 46344.0 | 1 | 1 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 66 | 122 | 0 | 27 | 21 | 3 | 1 | Single | 3 | 6 | 2 | 2 | 0 |
In [134]:
# --- Creation of the pivot tables (sum of MntTotal for each Marital x People_household). ---
# Expenses
Df_Marital_expenses = df_child.pivot_table(values='MntTotal', index='Marital', columns='People_household', aggfunc='sum', fill_value=0)
Df_Marital_exp = Df_Marital_expenses.reset_index()
Df_Marital_exp.columns.name = None
# NOTE(review): value_name='Count' actually holds summed expenses, not counts;
# kept as-is because a later plotting cell references the 'Count' column.
Df_Marital_exp = Df_Marital_exp.melt(id_vars='Marital', var_name='People_household', value_name='Count')
# Count of customers per (Marital, People_household)
Df_Marital_count = pd.crosstab(df_child['Marital'], df_child['People_household'])
display(Df_Marital_expenses, Df_Marital_count)
| People_household | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| Marital | |||||
| Divorced | 58544 | 69579 | 13000 | 466 | 0 |
| Married | 0 | 237934 | 216730 | 42411 | 4747 |
| Single | 190930 | 82933 | 15605 | 3039 | 0 |
| Together | 0 | 177358 | 136336 | 26841 | 4523 |
| Widow | 29536 | 21111 | 4678 | 0 | 0 |
| People_household | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| Marital | |||||
| Divorced | 56 | 119 | 49 | 5 | 0 |
| Married | 0 | 227 | 443 | 167 | 16 |
| Single | 168 | 221 | 73 | 14 | 0 |
| Together | 0 | 150 | 294 | 109 | 15 |
| Widow | 26 | 33 | 17 | 0 | 0 |
In [135]:
# ------------------------------------------- Marital status & family analysis graphics
# ------------------------------- COUNT graphs
Tmess('Total customers by marital status and household size', Color='firebrick', Align='center', Size=15, Weight='bold')
# Grouped barplot of customer counts.
plt.figure(figsize=(15, 4))
plt.subplot(1, 2, 1)
# FIX: order now comes from df_child (the frame being plotted) instead of df,
# so marital categories present only in the raw df cannot create empty slots.
sns.countplot(data=df_child, x='Marital', hue='People_household', palette='icefire', order=sorted(df_child['Marital'].unique()))
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # background grid
plt.gca().set_axisbelow(True)
# Stacked barplot of the same counts.
plt.subplot(1, 2, 2)
Df_Marital_count.plot(kind='bar', stacked=True, colormap='icefire', ax=plt.gca())
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # background grid
plt.gca().set_axisbelow(True)
plt.tight_layout()
plt.show()
# Comments with calculated percentages
One_child = df_child[(df_child['Children'] == 1)]['Marital'].count()
Couple_Customer = df_child[(df_child['Marital'] == 'Married') | (df_child['Marital'] == 'Together')]['Marital'].count()
Single_Customer = df_child[(df_child['Marital'] != 'Married') & (df_child['Marital'] != 'Together')]['Marital'].count()
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess('👉 Married, Together represent {} % of the customer base'.format(round(Couple_Customer * 100 / len(df_child), 2)), Size=12, Color='blue')
Tmess('👉 The different types of single (divorced/widow) represent {} %.'.format(round(Single_Customer * 100 / len(df_child), 2)), Size=12, Color='blue')
Tmess('👉 Households with 1 child represent the majority of the clientele {} %'.format(round(One_child * 100 / len(df_child), 2)), Size=12, Color='blue')
# --------------------------------------- EXPENSES graphs
Tmess('Total expenses by marital status and household size', Color='firebrick', Align='center', Size=15, Weight='bold')
plt.figure(figsize=(15, 4))
# Grouped barplot of total spending.
plt.subplot(1, 2, 1)
sns.barplot(data=Df_Marital_exp, x='Marital', y='Count', hue='People_household', palette='icefire')
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # background grid
plt.gca().set_axisbelow(True)
# Stacked barplot of total spending.
plt.subplot(1, 2, 2)
Df_Marital_expenses.plot(kind='bar', stacked=True, colormap='icefire', ax=plt.gca())
plt.ylabel('Count')
plt.xticks(rotation=30)
plt.grid(True, linestyle='-', linewidth=0.7, zorder=0)  # background grid
plt.gca().set_axisbelow(True)
plt.tight_layout()
plt.show()
# Spending breakdowns used in the commentary below.
Total = Df_Marital_exp['Count'].sum()
sing = df_child.loc[(df_child['Adult_household'] == 1) & (df_child['Children'] == 0), 'MntTotal'].sum()
coup = df_child.loc[(df_child['Adult_household'] == 2) & (df_child['Children'] == 0), 'MntTotal'].sum()
Child1 = df_child.loc[(df_child['Adult_household'] == 1) & (df_child['Children'] == 1), 'MntTotal'].sum()
Child2 = df_child.loc[(df_child['Adult_household'] == 2) & (df_child['Children'] == 1), 'MntTotal'].sum()
Child3 = df_child.loc[df_child['Children'] > 1, 'MntTotal'].sum()
tcoup = df_child.loc[df_child['Adult_household'] == 2, 'MntTotal'].sum()
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess("👉 Couples (married or not) spent on their own {} € - {} %".format(tcoup, round(tcoup * 100 / Total, 2)), Size=12, Color='blue')
Tmess("👉 Households without children spent {} €, with {} % for couples and {} % for the others.".format(sing + coup,
      round(coup * 100 / Total, 2), round(sing * 100 / Total, 2)), Size=12, Color='blue')
Tmess("👉 Households with 1 child spent €{}, with {}% for couples and {}% for the others".format(Child1 + Child2,
      round(Child2 * 100 / Total, 2), round(Child1 * 100 / Total, 2)), Size=12, Color='blue')
Total customers by marital status and household size
Comments
👉 Married, Together represent 64.53 % of the customer base
👉 The different types of single (divorced/widow) represent 35.47 %.
👉 Households with 1 child represent the majority of the clientele 50.41 %
Total expenses by marital status and household size
Comments
👉 Couples (married or not) spent on their own 846880 € - 63.37 %
👉 Households without children spent 694302 €, with 31.08 % for couples and 20.88 % for the others.
👉 Households with 1 child spent €526689, with 26.42% for couples and 12.99% for the others
In [137]:
# Share of households by number of children, as percentages.
df_counts = df_child['Children'].value_counts(normalize=True).reset_index()
df_counts.columns = ['Children', 'Percentage']
df_counts['Percentage'] = round(df_counts['Percentage'] * 100, 2)
# Sort by number of children; drop=True replaces the previous
# reset_index() + drop('index') two-step and avoids the stray column.
df_counts = df_counts.sort_values(by='Children').reset_index(drop=True)
# Visualization
Tmess('Distribution of households by number of children (%)', Color='firebrick', Align='center', Size=15, Weight='bold')
plt.figure(figsize=(8, 3))
sns.barplot(x='Children', y='Percentage', data=df_counts, palette='icefire')
plt.xlabel('Number of children.', fontsize=12)
plt.ylabel('Pourcentage (%)', fontsize=12)
plt.ylim(0, 60)  # headroom above the tallest bar for the percentage labels
# Annotate each bar with its percentage.
for i, row in df_counts.iterrows():
    plt.text(i, row['Percentage'] + 1, f"{row['Percentage']}%", ha='center', color='firebrick', fontweight='bold')
plt.show()
Distribution of households by number of children (%)
In [138]:
# ----------------------------------------- Spending Distribution by Household Composition
plt.figure(figsize=(15, 4))
Tmess('Distribution of expenses based on household size according to marital status.', Color='firebrick', Align='center', Size=15, Weight='bold')
# Subplot: Expenses for single people with children
plt.subplot(1, 2, 1)
sns.boxplot(x='SingleChildren', y='MntTotal', data=df_child, palette='icefire')
plt.title('Single Person Expenses')
plt.xlabel('Number of Children')
# Subplot: Expenses for couples with children
plt.subplot(1, 2, 2)
sns.boxplot(x='CoupleChildren', y='MntTotal', data=df_child, palette='icefire')
plt.title('Couple Expenses')
plt.xlabel('Number of Children')
plt.tight_layout()
plt.show()
# Comments
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess("👉 From two children, there is a greater financial ease for the 'Married' category", Size=12, Color='blue')
# ----------------------------------------- Spending Distribution by Household Size
plt.figure(figsize=(15, 4))
Tmess('Spending Distribution based on household size', Color='firebrick', Align='center', Size=15, Weight='bold')
# List of variables for subplots: (column, x-axis label, subplot title)
variables = [
    ('Adult_household', 'Number of Adults', 'Spending Distribution by Number of Adults without child'),
    ('Children', 'Number of Children', 'Spending Distribution by Number of Children'),
    ('People_household', 'Number of People', 'Spending Distribution by Total Number of Persons')]
# Automatic generation of subplots
# NOTE(review): 'People_household' is plotted from the full df while the other
# variables come from df_child — presumably intentional; confirm both frames agree.
for i, (col, xlabel, title) in enumerate(variables, start=1):
    plt.subplot(1, 3, i)
    sns.boxplot(x=col, y='MntTotal', data=df if col == 'People_household' else df_child, palette='icefire')
    plt.title(title)
    plt.xlabel(xlabel)
plt.tight_layout()
plt.show()
# Comments
Tmess('Comments', Size=14, Color='firebrick', Weight='bold')
Tmess("👉 Households without children or with a limited number of adults tend to have higher average spending.", Size=12, Color='blue')
Tmess("👉 Budget constraints clearly increase with household size (adults + children).", Size=12, Color='blue')
Tmess("🎯 These observations can guide targeted marketing strategies based on household composition. For instance, child-free households might be more receptive to premium offers, while larger families might prefer budget-friendly options.", Size=12, Color='blue')
# Same grid, but for purchase counts instead of spending amounts.
variables = [
    ('Adult_household', 'Number of Adults', 'Purchase Distribution by Number of Adults'),
    ('Children', 'Number of Children', 'Purchase Distribution by Number of Children'),
    ('People_household', 'Number of People', 'Purchase Distribution by Total Number of Persons')]
Tmess('Purchase Distribution based on household size', Color='firebrick', Align='center', Size=15, Weight='bold')
plt.figure(figsize=(15, 4))
# Automatic generation of subplots
for i, (col, xlabel, title) in enumerate(variables, start=1):
    plt.subplot(1, 3, i)
    sns.boxplot(x=col, y='Total_Purchases', data=df if col == 'People_household' else df_child, palette='icefire')
    plt.title(title)
    plt.xlabel(xlabel)
plt.tight_layout()
plt.show()
Distribution of expenses based on household size according to marital status.
Comments
👉 From two children, there is a greater financial ease for the 'Married' category
Spending Distribution based on household size
Comments
👉 Households without children or with a limited number of adults tend to have higher average spending.
👉 Budget constraints clearly increase with household size (adults + children).
🎯 These observations can guide targeted marketing strategies based on household composition. For instance, child-free households might be more receptive to premium offers, while larger families might prefer budget-friendly options.
Purchase Distribution based on household size
In [140]:
# Product-spending columns crossed with household-size variables in a grid of boxplots.
columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
variables = [('People_household', 'Number of People', 'Expenses Distribution by Total Number of Persons'),
             ('Adult_household', 'Number of Adults', 'Expenses Distribution by Number of Adults'),
             ('Children', 'Number of Children', 'Expenses Distribution by Number of Children')]
# Visualization: one row per product, one column per household variable.
# (The unused 'total_plots' variable was removed.)
plt.figure(figsize=(18, 18))
for i, col in enumerate(columns):
    for j, (x_col, xlabel, title) in enumerate(variables):
        plt_index = i * len(variables) + j + 1  # 1-based subplot position
        plt.subplot(len(columns), len(variables), plt_index)
        sns.boxplot(x=x_col, y=col, data=df_child, palette='icefire')
        # Titles only on the first row to avoid repetition.
        plt.title(title if i == 0 or i == 3 else "", fontsize=16, color='firebrick')
        plt.xlabel(xlabel)  # plt.xlabel(xlabel if i == len(columns) - 1 else "")
        plt.ylabel(col if j == 0 else "", size=15)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
In [142]:
# Purchase-channel columns crossed with household-size variables in a grid of boxplots.
columns = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
variables = [('People_household', 'Number of People', 'Purchase Distribution by Total Number of Persons'),
             ('Adult_household', 'Number of Adults', 'Purchase Distribution by Number of Adults'),
             ('Children', 'Number of Children', 'Purchase Distribution by Number of Children')]
# Visualization: one row per channel, one column per household variable.
# (The unused 'total_plots' variable was removed.)
plt.figure(figsize=(18, 18))
for i, col in enumerate(columns):
    for j, (x_col, xlabel, title) in enumerate(variables):
        plt_index = i * len(variables) + j + 1  # 1-based subplot position
        plt.subplot(len(columns), len(variables), plt_index)
        sns.boxplot(x=x_col, y=col, data=df_child, palette='icefire')
        # Titles only on the first row; x labels only on the last row.
        plt.title(title if i == 0 or i == 3 else "", fontsize=16, color='firebrick')
        plt.xlabel(xlabel if i == len(columns) - 1 else "")
        plt.ylabel(col if j == 0 else "", size=15)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
📘 Explanation of Correlation Method Choice¶
We observed that some variables have asymmetric distributions.
Additionally, there are several binary or ordinal variables in the dataset.
➡️ Pearson is not suitable in this context.
➡️ Spearman or Kendall are better suited for our DataFrame.
💡 I chose Kendall because it is more robust to outliers and tied ranks, and its higher computational cost is acceptable since our DataFrame is relatively small.
In [145]:
# Kendall rank correlation across every numeric column of df.
numeric_df = df.select_dtypes(include=np.number)
matrix = numeric_df.corr(method='kendall')
# Clustered heatmap: groups strongly-correlated variables together.
sns.clustermap(matrix, cbar_pos=(-0.05, 0.15, 0.03, 0.7), cmap='coolwarm', center=0, figsize=(12, 8))
# Upper-triangle mask so each variable pair appears exactly once.
mask = np.triu(np.ones_like(matrix, dtype=bool))
# Annotated triangular heatmap of the same matrix.
message = "Correlation Matrix with Kendall"
Tmess(message, Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(20, 10))
sns.heatmap(matrix, mask=mask, annot=True, cmap='coolwarm', fmt=".2f", cbar_kws={"shrink": .8})
plt.show()
Correlation Matrix with Kendall
In [148]:
# Select relevant variables for clustering.
# 'Marital' is dropped so the analysis frame is fully numeric.
df_clust = df_child.drop('Marital',axis=1)
# Features fed to K-Means: income, spending by product, purchase channels,
# and household-composition variables.
features_for_clustering = ['Income',
'MntTotal', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts','MntGoldProds',
'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'Total_Purchases',
'Education_level', 'Adult_household', 'People_household']
In [150]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
# Feature matrix used for clustering.
X = df_clust[features_for_clustering]
# Data standardization.
# Several scalers were evaluated (MinMaxScaler, log1p FunctionTransformer,
# PowerTransformer 'yeo-johnson'); StandardScaler is the one retained, so the
# dead 'scaler = MinMaxScaler()' assignment it overwrote was removed.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [160]:
# Elbow method (inertia) and silhouette score over a range of candidate k values.
inertias = []
silhouette_scores = []
k_values = range(2, 12)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
# Visualize the elbow curve and the silhouette curve side by side.
plt.figure(figsize=(10, 3))
plt.subplot(1, 2, 1)
plt.plot(k_values, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method', color='firebrick')
plt.subplot(1, 2, 2)
plt.plot(k_values, silhouette_scores, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette score', color='firebrick')
plt.tight_layout()
plt.show()
# The silhouette maximum is the usable criterion. Inertia decreases
# monotonically with k, so argmin(inertia) always picks the largest k —
# the previous 'optimal_k_inertia' was meaningless and unused, and was removed.
optimal_k_silhouette = k_values[silhouette_scores.index(max(silhouette_scores))]
Tmess("Optimal number of clusters according to these curves: {}".format(optimal_k_silhouette), Color="blue", Size = 12)
Optimal number of clusters according to these curves: 2
In [170]:
# Final clustering with k=8 — chosen over the silhouette optimum of 2
# (presumably for a finer customer segmentation; see the analysis below).
k=8
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
# Attach the cluster label to the (unscaled) analysis frame.
df_clust['Cluster'] = clusters
Tmess(f"\nCluster analysis with k={k}", Color="blue", Size = 12)
# Per-cluster means of every numeric column (displayed as the cell output).
df_clust.groupby('Cluster').mean()
Cluster analysis with k=8
Out[170]:
| Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Age | Customer_Days | AcceptedCmpOverall | MntTotal | MntRegularProds | Education_level | Adult_household | People_household | Total_Purchases | Children | SingleChildren | CoupleChildren | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cluster | ||||||||||||||||||||||||||||||||||
| 0 | 81070.678322 | 0.013986 | 0.034965 | 47.678322 | 793.069930 | 89.797203 | 714.132867 | 110.090909 | 81.615385 | 76.594406 | 1.000000 | 5.048951 | 6.678322 | 8.174825 | 2.615385 | 0.118881 | 0.146853 | 0.398601 | 0.314685 | 0.013986 | 0.000000 | 0.454545 | 50.447552 | 406.237762 | 0.993007 | 1865.300699 | 1788.706294 | 3.580420 | 1.657343 | 1.706294 | 20.902098 | 0.048951 | 0.013986 | 0.034965 |
| 1 | 34554.350000 | 0.772222 | 0.466667 | 49.097222 | 41.811111 | 5.163889 | 23.305556 | 6.725000 | 5.050000 | 16.083333 | 2.152778 | 2.127778 | 0.580556 | 3.183333 | 6.438889 | 0.080556 | 0.008333 | 0.000000 | 0.002778 | 0.000000 | 0.011111 | 0.130556 | 48.680556 | 331.922222 | 0.091667 | 98.138889 | 82.055556 | 3.352778 | 1.000000 | 2.238889 | 8.044444 | 1.238889 | 1.238889 | 0.000000 |
| 2 | 75207.589189 | 0.016216 | 0.113514 | 53.767568 | 527.232432 | 38.194595 | 444.459459 | 98.956757 | 51.610811 | 77.783784 | 1.216216 | 4.329730 | 6.064865 | 8.113514 | 2.464865 | 0.048649 | 0.070270 | 0.151351 | 0.183784 | 0.027027 | 0.005405 | 0.286486 | 54.000000 | 357.367568 | 0.481081 | 1238.237838 | 1160.454054 | 3.400000 | 1.378378 | 1.508108 | 19.724324 | 0.129730 | 0.113514 | 0.016216 |
| 3 | 70806.629213 | 0.095506 | 0.477528 | 48.578652 | 433.056180 | 103.500000 | 281.775281 | 120.764045 | 101.825843 | 97.775281 | 2.117978 | 6.140449 | 4.893258 | 8.786517 | 3.657303 | 0.056180 | 0.050562 | 0.117978 | 0.101124 | 0.011236 | 0.016854 | 0.089888 | 50.213483 | 379.028090 | 0.337079 | 1138.696629 | 1040.921348 | 2.971910 | 1.724719 | 2.297753 | 21.938202 | 0.573034 | 0.162921 | 0.410112 |
| 4 | 68811.727273 | 0.142857 | 0.718615 | 48.173160 | 780.515152 | 27.506494 | 239.047619 | 39.437229 | 30.917749 | 59.255411 | 2.909091 | 6.645022 | 5.025974 | 9.415584 | 4.896104 | 0.112554 | 0.225108 | 0.194805 | 0.129870 | 0.064935 | 0.004329 | 0.212121 | 54.852814 | 408.653680 | 0.727273 | 1176.679654 | 1117.424242 | 4.004329 | 1.961039 | 2.822511 | 23.995671 | 0.861472 | 0.038961 | 0.822511 |
| 5 | 57645.307292 | 0.182292 | 0.760417 | 45.416667 | 461.791667 | 23.916667 | 138.520833 | 29.166667 | 21.515625 | 61.640625 | 3.218750 | 6.671875 | 3.109375 | 7.760417 | 5.781250 | 0.078125 | 0.166667 | 0.041667 | 0.026042 | 0.020833 | 0.005208 | 0.182292 | 54.380208 | 416.854167 | 0.333333 | 736.552083 | 674.911458 | 3.723958 | 1.005208 | 1.947917 | 20.760417 | 0.942708 | 0.942708 | 0.000000 |
| 6 | 52553.410256 | 0.400641 | 0.807692 | 49.483974 | 271.307692 | 14.762821 | 87.108974 | 22.426282 | 15.096154 | 48.916667 | 3.673077 | 5.525641 | 2.028846 | 6.224359 | 5.759615 | 0.051282 | 0.086538 | 0.006410 | 0.028846 | 0.000000 | 0.009615 | 0.089744 | 53.448718 | 375.750000 | 0.173077 | 459.618590 | 410.701923 | 3.621795 | 1.974359 | 3.182692 | 17.451923 | 1.208333 | 0.064103 | 1.144231 |
| 7 | 33121.763727 | 0.798669 | 0.455907 | 49.206323 | 30.605657 | 4.118136 | 18.136439 | 5.951747 | 4.445923 | 13.013311 | 1.898502 | 1.863561 | 0.427621 | 3.084859 | 6.492512 | 0.068220 | 0.011647 | 0.000000 | 0.000000 | 0.003328 | 0.011647 | 0.066556 | 48.366057 | 326.033278 | 0.083195 | 76.271215 | 63.257903 | 3.276206 | 2.000000 | 3.254576 | 7.274542 | 1.254576 | 0.000000 | 1.254576 |
In [172]:
# Fit PCA with all components to inspect the cumulative explained variance.
pca = PCA().fit(X_scaled)  # no need to fix n_components up front
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
Tmess("Choice of the number of components", Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(10, 3))
plt.plot(explained_variance_ratio)
plt.axhline(y=0.90, color='red', linestyle='--', label='90% Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.legend()  # FIX: the axhline label was defined but never displayed
plt.grid(True)
plt.show()
Choice of the number of components
In [174]:
# Colormap shared by both scatter plots.
cmap = 'icefire'
# Dimensionality reduction with PCA (7 components, per the variance curve above).
pca = PCA(n_components=7)
X_pca = pca.fit_transform(X_scaled)
# Dimensionality reduction with t-SNE (2-D embedding, for visualization only).
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
# Cluster centroids in each embedding (mean of the embedded points per cluster).
centroids_pca = np.array([X_pca[clusters == i].mean(axis=0) for i in np.unique(clusters)])
centroids_tsne = np.array([X_tsne[clusters == i].mean(axis=0) for i in np.unique(clusters)])
# Side-by-side figure: PCA on the left, t-SNE on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
# PCA plot (first two components only).
scatter1 = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap=cmap, s=10)
ax1.scatter(centroids_pca[:, 0], centroids_pca[:, 1], c='red', marker='x', s=150, linewidths=5, label='Centroids')
ax1.set_xlabel('PCA Component 1')
ax1.set_ylabel('PCA Component 2')
ax1.set_title('Clusters visualized in 2D with PCA', color='firebrick', size=20)
ax1.legend(*scatter1.legend_elements(), title="Clusters",
           bbox_to_anchor=(1.05, 1), loc='upper left')
# t-SNE plot.
scatter2 = ax2.scatter(X_tsne[:, 0], X_tsne[:, 1], c=clusters, cmap=cmap, s=10)
ax2.scatter(centroids_tsne[:, 0], centroids_tsne[:, 1], c='red', marker='x', s=150, linewidths=5, label='Centroids')
ax2.set_xlabel('t-SNE Component 1')
ax2.set_ylabel('t-SNE Component 2')
ax2.set_title('Clusters visualized in 2D with t-SNE', color='firebrick', size=20)
ax2.legend()
plt.tight_layout()
plt.show()
In [177]:
# Spending columns to average per cluster.
montant = ['Cluster', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
# Per-cluster means (df_clust_mean is reused by the campaign cell below).
df_clust_mean = df_clust.groupby('Cluster').mean().reset_index()
# Long format for a grouped barplot.
melted_df = pd.melt(df_clust_mean[montant], id_vars="Cluster", var_name="Product", value_name="Consumption")
# Visualization
Tmess("Average expenses by product", Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(12, 4))
sns.barplot(x="Cluster", y="Consumption", hue="Product", data=melted_df, ci=None, palette="icefire")
plt.xlabel("Cluster")
plt.ylabel("Average expenses")
plt.legend(title="Product", loc="lower left", bbox_to_anchor=(1, 0.5))
plt.show()
Average expenses by product
In [179]:
# Campaign-acceptance columns to average per cluster.
Campaign = ['Cluster', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
# Long format for plotting. FIX: the melted column, hue and legend title are now
# labeled 'Campaign' — the previous 'Product' label was copied from the spending
# cell and mislabeled the legend of this chart.
melted_df = pd.melt(df_clust_mean[Campaign], id_vars="Cluster", var_name="Campaign", value_name="Consumption")
# Visualization
Tmess("Average of Accepted Campaigns", Align='center', Color='Firebrick', Size='16', Weight='Bold')
plt.figure(figsize=(12, 4))
sns.barplot(x="Cluster", y="Consumption", hue="Campaign", data=melted_df, ci=None, palette="icefire")
plt.xlabel("Cluster")
plt.ylabel("Average Campaign Acceptance")
plt.legend(title="Campaign", loc="lower left", bbox_to_anchor=(1, 0.5))
plt.show()
Average of Accepted Campaigns
In [180]:
# Analyze cluster characteristics in the standardized feature space.
cluster_df = pd.DataFrame(X_scaled, columns=features_for_clustering)
cluster_df['Cluster'] = clusters
# Per-cluster means of the z-scored features: the sign shows whether a cluster
# sits above or below the overall average on each feature.
cluster_means = cluster_df.groupby('Cluster').mean()
cluster_means
Out[180]:
| Income | MntTotal | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | Total_Purchases | Education_level | Adult_household | People_household | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cluster | |||||||||||||||
| 0 | 1.423162 | 2.093221 | 1.441625 | 1.592286 | 2.551984 | 1.318270 | 1.323631 | 0.627785 | 0.344740 | 1.471926 | 0.724109 | 0.793123 | 0.121286 | 0.025125 | -0.980486 |
| 1 | -0.829838 | -0.846174 | -0.784593 | -0.534634 | -0.657052 | -0.566854 | -0.537587 | -0.541845 | -0.723535 | -0.748579 | -0.818319 | -0.902717 | -0.105496 | -1.348874 | -0.392530 |
| 2 | 1.139185 | 1.050201 | 0.653865 | 0.295460 | 1.299295 | 1.115212 | 0.594254 | 0.650774 | 0.081721 | 1.248536 | 0.705163 | 0.637782 | -0.058452 | -0.557975 | -1.199272 |
| 3 | 0.926027 | 0.884630 | 0.374791 | 1.936651 | 0.543593 | 1.512920 | 1.814925 | 1.037193 | 0.743902 | 0.821895 | 0.913128 | 0.929778 | -0.484924 | 0.165958 | -0.327548 |
| 4 | 0.829404 | 0.947808 | 1.404422 | 0.026858 | 0.345114 | 0.029731 | 0.091229 | 0.292636 | 0.928425 | 0.870223 | 1.107517 | 1.201145 | 0.543593 | 0.659921 | 0.251756 |
| 5 | 0.288563 | 0.215726 | 0.459943 | -0.063358 | -0.121854 | -0.157577 | -0.137326 | 0.338740 | 0.938245 | 0.172292 | 0.596052 | 0.774436 | 0.264282 | -1.337988 | -0.713748 |
| 6 | 0.041939 | -0.244909 | -0.104521 | -0.293404 | -0.360672 | -0.280504 | -0.293376 | 0.092797 | 0.519066 | -0.221183 | 0.121393 | 0.338067 | 0.162504 | 0.687763 | 0.649376 |
| 7 | -0.899225 | -0.882547 | -0.817798 | -0.560915 | -0.681064 | -0.580956 | -0.552272 | -0.601186 | -0.820160 | -0.804270 | -0.848748 | -1.004262 | -0.181778 | 0.741359 | 0.728732 |
In [185]:
# Main function to display boxplots by cluster.
def plot_cluster_boxplots(df, columns, cluster_col='Cluster'):
    """Draw one boxplot per variable, grouped by cluster, in a 3-wide grid.

    Arguments:
        df: DataFrame holding the variables and the cluster labels.
        columns: names of the columns to plot.
        cluster_col: name of the column holding the cluster label.
    """
    per_row = 3  # boxplots per row
    rows = -(-len(columns) // per_row)  # ceiling division
    fig, axes = plt.subplots(rows, per_row, figsize=(20, rows * 4))
    axes = axes.flatten()
    for ax, col in zip(axes, columns):
        sns.boxplot(x=cluster_col, y=col, data=df, ax=ax, palette="icefire")
        ax.set_title(f'Distribution de {col} par Cluster', fontsize=14, fontweight='bold')
        ax.set_xlabel('Cluster', fontsize=12)
        ax.set_ylabel(col, fontsize=12)
    # Hide the unused trailing axes of the grid.
    for ax in axes[len(columns):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()
# Variables whose per-cluster distribution we want to inspect.
columns = ['Income', 'Kidhome', 'Teenhome', 'Education_level', 'Adult_household', 'People_household', 'Age', 'SingleChildren', 'CoupleChildren']
# Displaying boxplots for all variables.
plot_cluster_boxplots(df_clust, columns)
In [189]:
import plotly.express as px
import plotly.graph_objects as go
# (The long inline listing of plotly colorscales was a no-op string expression
# and was removed; px.colors.named_colorscales() gives the same information.)
# Interactive scatter: total spending per cluster, colored by household size.
fig = px.scatter(
    df_clust,
    x='Cluster',
    y='MntTotal',
    size='Education_level',
    color='People_household',
    color_continuous_scale='purples',
    size_max=10,
    hover_data=['People_household', 'Education_level', 'MntTotal'],
    title='Distribution par Cluster'
)
# Layout customization (the update_layout title overrides the px.scatter one).
fig.update_layout(
    title={'text': 'Distribution par Cluster','x': 0.5,'font': {'size': 20, 'color': 'firebrick'}},
    xaxis_title='Cluster',
    yaxis_title='MntTotal',
    legend_title='Education Level',
    width=1100,
    height=500,
    showlegend=True,
    legend={'x': 1.05, 'y': 1},
    plot_bgcolor='white',  # plot background
    paper_bgcolor='white'
)
# Light grid on both axes.
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
# Black outline around the markers.
fig.update_traces(marker=dict(line=dict(color='black', width=0.8)))
# Display the interactive figure.
fig.show()
In [191]:
# Main function: one "identity card" of boxplots per cluster.
def plot_boxplots_by_cluster(df, columns, cluster_col='Cluster'):
    """
    Create boxplots for each variable, one figure per cluster.
    """
    clusters = df[cluster_col].unique()
    for cluster in clusters:
        # Keep only the rows of the current cluster
        df_cluster = df[df[cluster_col] == cluster]
        n_cols = len(columns)
        n_rows = (len(columns) // n_cols) + (len(columns) % n_cols > 0)
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))
        axes = axes.flatten()
        Tmess(f'Identity card - Cluster {cluster} - contains {df_cluster.shape[0]} individuals', Size=18, Weight='bold', Color='firebrick', Align='center')
        for i, col in enumerate(columns):
            ax = axes[i]
            sns.boxplot(y=col, data=df_cluster, ax=ax, palette="terrain")
            # Display the information
            ax.set_title(f'{col}', fontsize=14, fontweight='bold')
            ax.set_xlabel('')
            ax.set_ylabel(col, fontsize=12)
            # Draw the median as a thick red line
            median_value = df_cluster[col].median()
            ax.axhline(y=median_value, color='firebrick', linestyle='-', linewidth=2)
            # And the mean as a teal line
            mean_value = df_cluster[col].mean()
            ax.axhline(y=mean_value, color='teal', linestyle='-', linewidth=2)
            ax.text(x=-0.4, y=median_value, s='med', color='firebrick', fontsize=12, fontweight='bold')
            ax.text(x=0.2, y=mean_value, s='mean', color='teal', fontsize=12, fontweight='bold')
        # Hide any leftover empty axes
        for ax in axes[len(columns):]:
            ax.set_visible(False)
        plt.tight_layout()
        plt.show()
        # Scatter plot, only when all required columns are present
        required_cols = ['People_household', 'MntTotal', 'Adult_household', 'Education_level']
        if all(col in df.columns for col in required_cols):
            df_HH = df_cluster.copy()
            # Small horizontal jitter so 1-adult and 2-adult households do not overlap
            df_HH.loc[df_HH['Adult_household'] == 2, 'People_household'] += 0.05
            df_HH.loc[df_HH['Adult_household'] == 1, 'People_household'] -= 0.05
            if len(df_HH) > 0:
                plt.figure(figsize=(12, 3))
                scatter = sns.scatterplot(
                    data=df_HH,
                    x='People_household',
                    y='MntTotal',
                    size='Adult_household',
                    hue='Education_level',
                    palette='icefire',
                    sizes=(30, 80)
                )
                plt.title('Relation People_household vs MntTotal', fontsize=12, color='firebrick')
                plt.ylabel('MntTotal', fontsize=12)
                plt.xlabel('People_household', fontsize=12)
                plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left')
                plt.grid(axis='x', linestyle='--', alpha=0.7)
                plt.xticks(np.arange(1, 6))
                plt.tight_layout()
                plt.show()
        # Visual separator between clusters
        Tmess('\n -------------------------------------------- \n', Align='center')
In [193]:
# Order rows by cluster id so the identity cards come out 0..k-1.
df_clust = df_clust.sort_values(by='Cluster', ascending=True)
# Variables shown on each cluster identity card.
columns = ['MntTotal', 'Total_Purchases', 'Income', 'Education_level', 'Age', 'Kidhome', 'Teenhome', 'Adult_household', 'People_household', 'SingleChildren', 'CoupleChildren']
# Displaying boxplots by cluster
plot_boxplots_by_cluster(df_clust, columns)
Identity card - Cluster 0 - contains 143 individuals
--------------------------------------------
Identity card - Cluster 1 - contains 360 individuals
--------------------------------------------
Identity card - Cluster 2 - contains 185 individuals
--------------------------------------------
Identity card - Cluster 3 - contains 178 individuals
--------------------------------------------
Identity card - Cluster 4 - contains 231 individuals
--------------------------------------------
Identity card - Cluster 5 - contains 192 individuals
--------------------------------------------
Identity card - Cluster 6 - contains 312 individuals
--------------------------------------------
Identity card - Cluster 7 - contains 601 individuals
--------------------------------------------
Cluster Analysis¶
Cluster 0¶
- Average Spending: €1,850 across approximately 20 purchases.
- Composition: Mainly couples without children.
- Income: Significant (average €80k).
- Education: Level 3+.
Cluster 1¶
- Average Spending: €100 across 5 to 10 purchases.
- Composition: Single individuals with 1 or 2 children.
- Income: Relatively low (average €35k).
- Education: Level 3+.
Cluster 2¶
- Average Spending: €1,250 across approximately 20 purchases.
- Composition: Mainly single individuals without children.
- Income: High (average €75k).
- Education: Level 3+.
- Age: Higher.
Cluster 3¶
- Average Spending: €1,200 across approximately 20 purchases.
- Composition: Couples, potentially with 1 teenager.
- Income: High (average €70k).
- Education: Level 3.
Cluster 4¶
- Average Spending: €1,200 across 25 purchases.
- Composition: Couples with 1 teenager.
- Income: High (average €70k).
- Education: Level 4+.
- Age: Higher.
Cluster 5¶
- Average Spending: €750 across approximately 20 purchases.
- Composition: Single individuals with 1 teenager.
- Income: Good level (average €60k).
- Education: Level 3++.
- Age: Higher.
Cluster 6¶
- Average Spending: €450 across fewer than 20 purchases.
- Composition: Couples with at least 1 child.
- Income: Good level (average €55k).
- Education: Level 3++.
Cluster 7¶
- Average Spending: Less than €100 across fewer than 10 purchases.
- Composition: Exclusively couples with at least 1 young child, possibly 1 teenager.
- Income: Low (average €35k).
- Education: Level 3+.
In [195]:
# Campaign columns to profile per cluster.
columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
campaign = {}
# Sub-plots: 2 rows x 3 columns, one panel per campaign.
fig, axes = plt.subplots(2, 3, figsize=(18, 8))
for i, col in enumerate(columns):
    # Customers that accepted this campaign.
    campaign[col] = df_clust[df_clust[col] == 1]
    campaign_clust = campaign[col].groupby(['Cluster']).size().reset_index(name='count')
    row = i // 3
    col_pos = i % 3
    sns.barplot(ax=axes[row, col_pos], data=campaign_clust, x='Cluster', y='count', palette='icefire')
    # Annotate the total acceptances near the top of each panel
    # (x=0.7 in data coordinates, i.e. toward the left side of the axis).
    x_pos = 0.7
    y_pos = axes[row, col_pos].get_ylim()[1] * 0.95
    axes[row, col_pos].text(x_pos, y_pos, f'Total accepted: {campaign[col].shape[0]}',
                            fontsize=14, color='black', fontweight='bold')
    axes[row, col_pos].set_title(col, fontsize=18, color='firebrick', fontweight='bold')
    axes[row, col_pos].set_xlabel('Cluster')
    axes[row, col_pos].set_ylabel('Count')
# Tighten the spacing between the sub-plots.
plt.tight_layout()
plt.show()
In [198]:
# Build one sub-frame per cluster, restricted to the campaign columns,
# for the acceptance analysis below.
cluster_dfs = {}
for i in range (k):
    df_cluster = df_clust[df_clust['Cluster']==i]
    cluster_dfs[i] = df_cluster[columns]
In [200]:
for i in range(k):
    plt.figure(figsize=(10, 3))
    # Keep only the campaigns accepted at least once in this cluster.
    filtered_columns = cluster_dfs[i].loc[:, cluster_dfs[i].sum() != 0].columns
    # Map each remaining campaign to its x position on the axis.
    x_positions = {col: idx for idx, col in enumerate(filtered_columns)}
    for col in cluster_dfs[i].columns:
        # Row indices of the customers who accepted this campaign.
        indices = cluster_dfs[i][cluster_dfs[i][col] == 1].index
        if len(indices) > 0:  # only plot campaigns with at least one acceptance
            plt.scatter([col] * len(indices), indices, label=col, alpha=0.8, s=20)
            # Write the acceptance count above the column, shifted to the right.
            plt.text(x_positions[col] + 0.1, cluster_dfs[i].index.max() - 200, f'{len(indices)}', color='firebrick',
                     ha='left', va='bottom')
    plt.title(f'Cluster {i} - Campagnes acceptées', color='firebrick', size=14)
    plt.xlabel('')
    plt.ylabel('Customers Index')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xticks('')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    plt.ylim(cluster_dfs[i].index.min() - 50, cluster_dfs[i].index.max() + 100)
    plt.tight_layout()
    plt.show()
In [202]:
# Focus on a single cluster.
clustnb = 5
plt.figure(figsize=(12, 2.5))
scatter = sns.scatterplot(
    data=df_clust[df_clust['Cluster']==clustnb],
    x='MntTotal',
    y='Cluster',
    size='People_household',   # bubble size
    hue='Education_level',     # bubble color
    palette='icefire',
    sizes=(10, 100),           # min and max bubble size
    legend='brief'
)
# Customization. FIX: the axis labels were swapped relative to the plotted
# variables (x is MntTotal, y is the cluster id); the title now names the cluster.
plt.title(f'Cluster {clustnb}', fontsize=16, color='firebrick')
plt.xlabel('MntTotal', fontsize=12)
plt.ylabel('Cluster', fontsize=12)
plt.legend(title='People_household', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
In [237]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import classification_report
In [239]:
# Prepare the data for campaign modelling
df_camp = df_Prepared.copy()

# Encode marital status as ordinal codes (values not in the map are left as-is)
marital_codes = {'Single': 1, 'Married': 2, 'Divorced': 3, 'Together': 4, 'Widow': 5}
df_camp['Marital'] = df_camp['Marital'].replace(marital_codes)

# Columns to drop before modelling, and the acceptance targets to analyse
coldel = ['AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response', 'AcceptedCmpOverall']
colaccep = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'AcceptedCmp1', 'Response']
In [242]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer

# Data standardisation: min-max scaling to [0, 1]
scaler = MinMaxScaler()

# Alternatives tried during development, kept for reference:
# scaler = FunctionTransformer(np.log1p, validate=True)  # log(1 + x), handles zeros
# scaler = PowerTransformer(method='yeo-johnson')        # or 'box-cox'
# scaler = StandardScaler()
In [244]:
# Apply the chosen scaler and rebuild the DataFrame.
# Fix: the column labels must come from df_camp itself (the frame that was
# scaled), not from the unrelated `df` variable; the original row index is
# also preserved so later index-based lookups keep working.
scaled_data = scaler.fit_transform(df_camp)
df_camp = pd.DataFrame(scaled_data, columns=df_camp.columns, index=df_camp.index)
In [246]:
# Drop the other campaign targets before the correlation analysis
campain1 = df_camp.drop(coldel, axis=1)
# Kendall correlation of every variable with 'AcceptedCmp1'
correlations = campain1.corr(method='kendall')['AcceptedCmp1'].sort_values(ascending=False)
# Build a tidy frame for plotting
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
# Exclude the target variable itself
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp1']
# Horizontal bar plot of the correlations
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp1 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [248]:
# Columns excluded from the campaign-1 model (other targets + weak features)
coldel1 = [
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2',
    'Complain', 'Response', 'AcceptedCmp1', 'AcceptedCmpOverall',
    'Age', 'Adult_household', 'Education_level', 'Customer_Days',
    'Recency', 'Marital',
]
In [250]:
# Inspect the available columns before choosing which features to drop
df_camp.columns
Out[250]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
dtype='object')
In [254]:
# Train/test split for the campaign-1 target
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp1']
# NOTE(review): no stratify=y here — with a rare positive class the test split
# may be unrepresentative; confirm whether stratification was intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# SMOTE to balance the classes (applied to the training set only)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# ADASYN alternative, kept for reference:
#adasyn = ADASYN(random_state=42)
#X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Class ratio fed to XGBoost's scale_pos_weight.
# NOTE(review): this is total/positives (~2 after resampling), not the usual
# negatives/positives ratio — confirm this is intended.
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [256]:
# Hyper-parameter search + soft-voting ensemble for campaign 1 (AcceptedCmp1).
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Search grid for XGBoost
param_grid_xgb = {
    'n_estimators': [200, 220],
    'max_depth': [10, 12],
    'learning_rate': [0.1],
    'scale_pos_weight': [ratio_of_classes],  # class-imbalance weight (see previous cell)
    'colsample_bytree': [0.5, 0.7],
    'subsample': [0.6, 0.8]
}
# Search grid for Random Forest
param_grid_rf = {
    'n_estimators': [200, 220],
    'max_depth': [None],
    'class_weight': ['balanced']
}
# Grid search for XGBoost.
# NOTE(review): CV folds are drawn from SMOTE-resampled data, so CV scores
# may be optimistic relative to real held-out data.
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)
# Grid search for RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)
# Soft-voting ensemble of the two tuned models
model_Camp1 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp1.fit(X_resampled, y_resampled)
# Predictions and evaluation on the held-out test set
y_pred = model_Camp1.predict(X_test)
print(classification_report(y_test, y_pred))
# F1-score and Matthews correlation coefficient
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'scale_pos_weight': 2.0, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 200}
precision recall f1-score support
0.0 0.97 0.95 0.96 414
1.0 0.46 0.59 0.52 27
accuracy 0.93 441
macro avg 0.72 0.77 0.74 441
weighted avg 0.94 0.93 0.94 441
F1-score: 0.5161290322580645
MCC: 0.484878795449813
In [258]:
from sklearn.metrics import roc_curve, auc

# ROC curve and AUC of the campaign-1 ensemble on the held-out test set
fpr, tpr, thresholds = roc_curve(y_test, model_Camp1.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)
# Plot the ROC curve against the random-classifier diagonal
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
In [260]:
from joblib import dump, load

# Persist the campaign-1 ensemble, then reload it to verify the round-trip.
# Fix: removed the unused `roc_curve` / `auc` import that belonged to the
# previous plotting cell.
dump(model_Camp1, "model_Camp1.joblib")
# Reload the model from disk
model = load("model_Camp1.joblib")
model
Out[260]:
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced',
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, ga...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=200, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced',
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, ga...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=200, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')RandomForestClassifier(class_weight='balanced', n_estimators=200,
random_state=42)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.7, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=10, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=200, n_jobs=None,
num_parallel_tree=None, random_state=42, ...)In [261]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
# Fix: accuracy_score / precision_score / recall_score were used below without
# being imported anywhere in the notebook.
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Evaluate the saved campaign-1 model on the FULL dataset.
# NOTE(review): the full dataset includes the training rows, so these scores
# are optimistic compared with the held-out test-set evaluation above.
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1',
           'AcceptedCmpOverall', 'Age', 'Adult_household', 'Education_level', 'Customer_Days', 'Recency', 'Marital']
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp1']
# Predictions
y_pred = model.predict(X)
# Classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp1'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Fix: do not shadow sklearn's `auc` function with the score value
# (the original `auc = roc_auc_score(...)` forced later cells to re-import auc).
roc_auc_full = roc_auc_score(y_true, model.predict_proba(X)[:, 1])
print(f"ROC-AUC: {roc_auc_full}")
0.9863760217983651 0.8733333333333333 0.9225352112676056 0.8972602739726028
precision recall f1-score support
0.0 0.99 0.99 0.99 2060
1.0 0.87 0.92 0.90 142
accuracy 0.99 2202
macro avg 0.93 0.96 0.94 2202
weighted avg 0.99 0.99 0.99 2202
AcceptedCmp1
0.0 0.935513
1.0 0.064487
Name: proportion, dtype: float64
[[2041 19]
[ 11 131]]
ROC-AUC: 0.9942738958019964
In [264]:
# Importance des variables pour Random Forest
rf_importances = best_rf.feature_importances_
rf_features = pd.DataFrame({
'Feature': X.columns,
'Importance': rf_importances
}).sort_values(by='Importance', ascending=False)
# Importance des variables pour XGBoost
xgb_importances = best_xgb.feature_importances_
xgb_features = pd.DataFrame({
'Feature': X.columns,
'Importance': xgb_importances
}).sort_values(by='Importance', ascending=False)
# Fusionner les importances
combined_importances = pd.DataFrame({
'Feature': X.columns,
'RandomForest_Importance': rf_importances,
'XGBoost_Importance': xgb_importances
}).set_index('Feature')
# Moyenne des importances pour chaque variable
combined_importances['Average_Importance'] = combined_importances.mean(axis=1)
# Trier par importance moyenne
combined_importances = combined_importances.sort_values(by='Average_Importance', ascending=False)
print("Importances combinées des variables:")
combined_importances
Importances combinées des variables:
Out[264]:
| RandomForest_Importance | XGBoost_Importance | Average_Importance | |
|---|---|---|---|
| Feature | |||
| Income | 0.190842 | 0.199219 | 0.195030 |
| MntRegularProds | 0.090077 | 0.202953 | 0.146515 |
| MntWines | 0.088519 | 0.077272 | 0.082896 |
| NumCatalogPurchases | 0.104712 | 0.047787 | 0.076249 |
| NumDealsPurchases | 0.035666 | 0.095532 | 0.065599 |
| MntTotal | 0.080213 | 0.030274 | 0.055244 |
| NumStorePurchases | 0.059832 | 0.036734 | 0.048283 |
| NumWebVisitsMonth | 0.042369 | 0.045146 | 0.043757 |
| MntMeatProducts | 0.059955 | 0.018466 | 0.039211 |
| MntFruits | 0.036686 | 0.040942 | 0.038814 |
| Total_Purchases | 0.050242 | 0.023149 | 0.036695 |
| MntGoldProds | 0.027894 | 0.026649 | 0.027271 |
| Teenhome | 0.015396 | 0.039016 | 0.027206 |
| NumWebPurchases | 0.031306 | 0.021557 | 0.026431 |
| MntSweetProducts | 0.033143 | 0.018160 | 0.025651 |
| MntFishProducts | 0.027289 | 0.020559 | 0.023924 |
| Kidhome | 0.008949 | 0.032364 | 0.020657 |
| People_household | 0.016911 | 0.024223 | 0.020567 |
In [268]:
# Columns to drop before the campaign-2 correlation analysis
coldel = [
    'AcceptedCmp1', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5',
    'Response', 'AcceptedCmpOverall',
]
In [270]:
# Drop the other campaign targets before the correlation analysis
campain2 = df_camp.drop(coldel, axis=1)
# Kendall correlation of every variable with 'AcceptedCmp2'
# (the original comment wrongly referenced AcceptedCmp1)
correlations = campain2.corr(method='kendall')['AcceptedCmp2'].sort_values(ascending=False)
# Build a tidy frame for plotting
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
# Exclude the target variable itself
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp2']
# Horizontal bar plot of the correlations
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp2 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [271]:
# Columns excluded from the campaign-2 model.
# Fix: removed a dead first assignment of coldel1 that was immediately
# overwritten by the second one (the kept list is the one actually used).
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1',
           'AcceptedCmpOverall', 'MntFruits', 'MntFishProducts', 'MntSweetProducts', 'Recency', 'Customer_Days', 'Adult_household']
In [274]:
# Inspect the available columns before choosing which features to drop
df_camp.columns
Out[274]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
dtype='object')
In [276]:
# Train/test split for the campaign-2 target
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp2']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# SMOTE alternative, kept for reference:
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# ADASYN oversampling of the minority class (training set only)
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Class ratio fed to XGBoost's scale_pos_weight.
# NOTE(review): this is total/positives (~2 after resampling), not the usual
# negatives/positives ratio — confirm this is intended.
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [278]:
# Hyper-parameter search + soft-voting ensemble for campaign 2 (AcceptedCmp2).
# Search grid for XGBoost
param_grid_xgb = {
    'n_estimators': [220, 250],
    'max_depth': [8, 10],
    'learning_rate': [0.1],
    'scale_pos_weight': [ratio_of_classes],  # class-imbalance weight (see previous cell)
    'colsample_bytree': [0.3, 0.5],
    'subsample': [0.4, 0.6]
}
# Search grid for Random Forest
param_grid_rf = {
    'n_estimators': [180, 200],
    'max_depth': [None],
    'class_weight': ['balanced']
}
# Grid search for XGBoost.
# NOTE(review): CV folds are drawn from ADASYN-resampled data, so CV scores
# may be optimistic relative to real held-out data.
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)
# Grid search for RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)
# Soft-voting ensemble of the two tuned models
model_Camp2 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp2.fit(X_resampled, y_resampled)
# Predictions and evaluation on the held-out test set
y_pred = model_Camp2.predict(X_test)
print(classification_report(y_test, y_pred))
# F1-score and Matthews correlation coefficient
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 250, 'scale_pos_weight': 2.0005763688760805, 'subsample': 0.6}
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 180}
precision recall f1-score support
0.0 0.99 0.99 0.99 436
1.0 0.29 0.40 0.33 5
accuracy 0.98 441
macro avg 0.64 0.69 0.66 441
weighted avg 0.99 0.98 0.98 441
F1-score: 0.3333333333333333
MCC: 0.32912530817577573
In [280]:
from sklearn.metrics import auc  # re-imported because an earlier cell rebinds the name `auc` to a float

# ROC curve and AUC of the campaign-2 ensemble on the held-out test set
fpr, tpr, thresholds = roc_curve(y_test, model_Camp2.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)
# Plot the ROC curve against the random-classifier diagonal
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
In [282]:
from joblib import dump, load

# Persist the campaign-2 ensemble, then reload it to verify the round-trip.
# Fix: removed the unused `roc_curve` / `auc` import that belonged to the
# previous plotting cell.
dump(model_Camp2, "model_Camp2.joblib")
# Reload the model from disk
model2 = load("model_Camp2.joblib")
model2
Out[282]:
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced',
n_estimators=180,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.3, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, ga...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=250, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced',
n_estimators=180,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.3, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, ga...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=250, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')RandomForestClassifier(class_weight='balanced', n_estimators=180,
random_state=42)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.3, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=10, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=250, n_jobs=None,
num_parallel_tree=None, random_state=42, ...)In [286]:
# Evaluate the saved campaign-2 model on the FULL dataset.
# Fix: accuracy_score / precision_score / recall_score were used below without
# being imported anywhere in the notebook.
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score

# NOTE(review): the full dataset includes the training rows, so these scores
# are optimistic compared with the held-out test-set evaluation above.
coldel1 = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp2', 'Complain', 'Response', 'AcceptedCmp1',
           'AcceptedCmpOverall', 'MntFruits', 'MntFishProducts', 'MntSweetProducts', 'Recency', 'Customer_Days', 'Adult_household']
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp2']
# Predictions
y_pred = model2.predict(X)
# Classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp2'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Fix: do not shadow sklearn's `auc` function with the score value
roc_auc_full = roc_auc_score(y_true, model2.predict_proba(X)[:, 1])
print(f"ROC-AUC: {roc_auc_full}")
0.9963669391462306 0.84375 0.9 0.8709677419354839
precision recall f1-score support
0.0 1.00 1.00 1.00 2172
1.0 0.84 0.90 0.87 30
accuracy 1.00 2202
macro avg 0.92 0.95 0.93 2202
weighted avg 1.00 1.00 1.00 2202
AcceptedCmp2
0.0 0.986376
1.0 0.013624
Name: proportion, dtype: float64
[[2167 5]
[ 3 27]]
ROC-AUC: 0.9895794966236955
In [289]:
# Columns to drop before the campaign-3 correlation analysis
coldel = [
    'AcceptedCmp1', 'AcceptedCmp2',
    'AcceptedCmp4', 'AcceptedCmp5',
    'Response', 'AcceptedCmpOverall',
]
In [291]:
# Drop the other campaign targets before the correlation analysis
campain3 = df_camp.drop(coldel, axis=1)
# Kendall correlation of every variable with 'AcceptedCmp3'
# (the original comment wrongly referenced AcceptedCmp1)
correlations = campain3.corr(method='kendall')['AcceptedCmp3'].sort_values(ascending=False)
# Build a tidy frame for plotting
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
# Exclude the target variable itself
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp3']
# Horizontal bar plot of the correlations
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp3 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [293]:
# Columns excluded from the campaign-3 model (all targets + Complain)
coldel1 = [
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5',
    'Complain', 'Response', 'AcceptedCmpOverall',
]
In [295]:
# Inspect the available columns before choosing which features to drop
df_camp.columns
Out[295]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
dtype='object')
In [297]:
# Train/test split for the campaign-3 target
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp3']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# SMOTE alternative, kept for reference:
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# ADASYN oversampling of the minority class (training set only)
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Class ratio fed to XGBoost's scale_pos_weight.
# NOTE(review): this is total/positives (~2 after resampling), not the usual
# negatives/positives ratio — confirm this is intended.
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [299]:
# Hyper-parameter search + soft-voting ensemble for campaign 3 (AcceptedCmp3).
# Search grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [ratio_of_classes, 1],  # try different imbalance weights
    'colsample_bytree': [0.7, 1.0],
    'subsample': [0.8, 1.0]
}
# Search grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'class_weight': ['balanced', 'balanced_subsample']
}
# Grid search for XGBoost.
# NOTE(review): CV folds are drawn from ADASYN-resampled data, so CV scores
# may be optimistic relative to real held-out data.
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)
# Grid search for RandomForest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)
# Soft-voting ensemble of the two tuned models
model_Camp3 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp3.fit(X_resampled, y_resampled)
# Predictions and evaluation on the held-out test set
y_pred = model_Camp3.predict(X_test)
print(classification_report(y_test, y_pred))
# F1-score and Matthews correlation coefficient
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'n_estimators': 200}
precision recall f1-score support
0.0 0.93 0.99 0.96 401
1.0 0.62 0.20 0.30 40
accuracy 0.92 441
macro avg 0.77 0.59 0.63 441
weighted avg 0.90 0.92 0.90 441
F1-score: 0.3018867924528302
MCC: 0.31840662051218027
In [300]:
from sklearn.metrics import auc  # re-imported because an earlier cell rebinds the name `auc` to a float

# ROC curve and AUC of the campaign-3 ensemble on the held-out test set
fpr, tpr, thresholds = roc_curve(y_test, model_Camp3.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)
# Plot the ROC curve against the random-classifier diagonal
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
In [301]:
from joblib import dump, load

# Persist the campaign-3 ensemble, then reload it to verify the round-trip.
# Fix: removed the unused `roc_curve` / `auc` import that belonged to the
# previous plotting cell.
dump(model_Camp3, "model_Camp3.joblib")
# Reload the model from disk
model3 = load("model_Camp3.joblib")
model3
Out[301]:
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced_subsample',
max_depth=20,
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=Non...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=200, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced_subsample',
max_depth=20,
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=Non...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=200, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')RandomForestClassifier(class_weight='balanced_subsample', max_depth=20,
n_estimators=200, random_state=42)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.7, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=10, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=200, n_jobs=None,
num_parallel_tree=None, random_state=42, ...)In [302]:
# Evaluate the saved campaign-3 model on the FULL dataset.
# Fix: accuracy_score / precision_score / recall_score were used below without
# being imported anywhere in the notebook.
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score

# NOTE(review): the full dataset includes the training rows, so these scores
# are optimistic compared with the held-out test-set evaluation above.
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp3']
# Predictions
y_pred = model3.predict(X)
# Classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
print(df_camp['AcceptedCmp3'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Fix: do not shadow sklearn's `auc` function with the score value
roc_auc_full = roc_auc_score(y_true, model3.predict_proba(X)[:, 1])
print(f"ROC-AUC: {roc_auc_full}")
0.983197093551317 0.9632352941176471 0.803680981595092 0.8762541806020067
precision recall f1-score support
0.0 0.98 1.00 0.99 2039
1.0 0.96 0.80 0.88 163
accuracy 0.98 2202
macro avg 0.97 0.90 0.93 2202
weighted avg 0.98 0.98 0.98 2202
AcceptedCmp3
0.0 0.925976
1.0 0.074024
Name: proportion, dtype: float64
[[2034 5]
[ 32 131]]
ROC-AUC: 0.976829132529178
In [308]:
# Columns to drop before the campaign-4 correlation analysis
coldel = [
    'AcceptedCmp1', 'AcceptedCmp2',
    'AcceptedCmp3', 'AcceptedCmp5',
    'Response', 'AcceptedCmpOverall',
]
In [310]:
# Drop the other campaign targets before the correlation analysis
campain4 = df_camp.drop(coldel, axis=1)
# Kendall correlation of every variable with 'AcceptedCmp4'
# (the original comment wrongly referenced AcceptedCmp1)
correlations = campain4.corr(method='kendall')['AcceptedCmp4'].sort_values(ascending=False)
# Build a tidy frame for plotting
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
# Exclude the target variable itself
correlation_df = correlation_df[correlation_df['Feature'] != 'AcceptedCmp4']
# Horizontal bar plot of the correlations
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title('Correlations with AcceptedCmp4 (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [311]:
# Columns excluded from the campaign-4 model (all targets + weak features)
coldel1 = [
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5',
    'Complain', 'Response', 'AcceptedCmpOverall',
    'MntFishProducts', 'MntSweetProducts', 'NumDealsPurchases',
]
In [314]:
# Inspect the available columns before choosing which features to drop
df_camp.columns
Out[314]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
dtype='object')
In [316]:
# Train/test split for the campaign-4 target
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp4']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# SMOTE alternative, kept for reference:
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# ADASYN oversampling of the minority class (training set only)
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Class ratio fed to XGBoost's scale_pos_weight.
# NOTE(review): this is total/positives (~2 after resampling), not the usual
# negatives/positives ratio — confirm this is intended.
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [318]:
# --- Hyper-parameter search + soft-voting ensemble for the AcceptedCmp4 target ---
# NOTE(review): this cell is copy-pasted once per campaign; a shared helper
# function would remove the duplication.
# Search grid for XGBoost
param_grid_xgb = {
'n_estimators': [100, 200],
'max_depth': [3, 6, 10],
'learning_rate': [0.01, 0.1],
'scale_pos_weight': [ratio_of_classes, 1], # try different positive-class weights
'colsample_bytree': [0.7, 1.0],
'subsample': [0.8, 1.0]
}
# Search grid for Random Forest
param_grid_rf = {
'n_estimators': [100, 200],
'max_depth': [10, 20, None],
'class_weight': ['balanced', 'balanced_subsample']
}
# Grid search for XGBoost, optimising F1 (more informative than accuracy on imbalanced data)
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)
# Grid search for Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)
# Soft-voting ensemble of the two tuned models (averages predicted probabilities)
model_Camp4 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp4.fit(X_resampled, y_resampled)
# Evaluate on the untouched (non-resampled) test split
y_pred = model_Camp4.predict(X_test)
print(classification_report(y_test, y_pred))
# F1 and Matthews correlation coefficient for the positive class
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'scale_pos_weight': 1.9969306322897482, 'subsample': 1.0}
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': 20, 'n_estimators': 100}
precision recall f1-score support
0.0 0.96 0.96 0.96 414
1.0 0.41 0.44 0.43 27
accuracy 0.93 441
macro avg 0.69 0.70 0.69 441
weighted avg 0.93 0.93 0.93 441
F1-score: 0.42857142857142855
MCC: 0.39016703715254414
In [320]:
from sklearn.metrics import auc

# ROC curve of the ensemble on the held-out test split
test_scores = model_Camp4.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
roc_auc = auc(fpr, tpr)

# Plot the curve against the random-classifier diagonal
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.axis([0.0, 1.0, 0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
In [321]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc

# Persist the fitted ensemble, then reload it to confirm the artifact round-trips
model_path = "model_Camp4.joblib"
dump(model_Camp4, model_path)
model4 = load(model_path)
model4
Out[321]:
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced_subsample',
max_depth=20,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=N...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced_subsample',
max_depth=20,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=N...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')RandomForestClassifier(class_weight='balanced_subsample', max_depth=20,
random_state=42)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.7, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=10, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, random_state=42, ...)In [322]:
# Evaluate the reloaded model on the FULL dataset for the AcceptedCmp4 target.
# NOTE(review): X includes the rows the model was trained on, so these metrics
# are optimistic; the honest estimate is the held-out test report above.
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp4']
# Predictions
y_pred = model4.predict(X)
# Headline metrics (zero_division=0 avoids warnings if a class is never predicted)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
# Class balance for context
print(df_camp['AcceptedCmp4'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Renamed from `auc`: that name shadowed sklearn.metrics.auc and forced the
# ROC cells to re-import it.
roc_auc_full = roc_auc_score(y_true, model4.predict_proba(X)[:, 1])
print(f"ROC-AUC: {roc_auc_full}")
0.9854677565849228 0.8975903614457831 0.9085365853658537 0.9030303030303031
precision recall f1-score support
0.0 0.99 0.99 0.99 2038
1.0 0.90 0.91 0.90 164
accuracy 0.99 2202
macro avg 0.95 0.95 0.95 2202
weighted avg 0.99 0.99 0.99 2202
AcceptedCmp4
0.0 0.925522
1.0 0.074478
Name: proportion, dtype: float64
[[2021 17]
[ 15 149]]
ROC-AUC: 0.9892439981809042
In [328]:
# Campaign-outcome columns to drop before the AcceptedCmp5 correlation analysis
coldel = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'Response', 'AcceptedCmpOverall']
In [330]:
# Drop the other campaign-outcome columns so only candidate predictors remain
target = 'AcceptedCmp5'
campain5 = df_camp.drop(coldel, axis=1)

# Kendall correlation of every remaining variable with the target
correlations = (
    campain5.corr(method='kendall')[target]
    .sort_values(ascending=False)
)

# Tidy frame for plotting, without the target's trivial self-correlation
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
correlation_df = correlation_df[correlation_df['Feature'] != target]

# Horizontal barplot of the correlation coefficients
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title(f'Correlations with {target} (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [331]:
# Columns excluded from the AcceptedCmp5 model: all campaign targets plus a few
# features — presumably dropped after the correlation analysis; TODO confirm rationale
coldel1 = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain', 'Response', 'AcceptedCmpOverall',
'Customer_Days', 'Recency', 'Marital', 'Age']
# Sanity check of the available columns
df_camp.columns
Out[331]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
dtype='object')
In [334]:
# Train/test split for the AcceptedCmp5 target
X = df_camp.drop(coldel1, axis=1)
y = df_camp['AcceptedCmp5']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# SMOTE oversampling was tried first; kept here for reference
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# ADASYN used instead of SMOTE to balance the classes (only the training split is resampled)
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Class ratio, used as a scale_pos_weight candidate in the XGBoost grid below.
# NOTE(review): this is total/positives (~2 after resampling), not the usual
# negatives/positives ratio — confirm this is intended.
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [336]:
# --- Hyper-parameter search + soft-voting ensemble for the AcceptedCmp5 target ---
# NOTE(review): this cell is copy-pasted once per campaign; a shared helper
# function would remove the duplication.
# Search grid for XGBoost
param_grid_xgb = {
'n_estimators': [100, 200],
'max_depth': [3, 6, 10],
'learning_rate': [0.01, 0.1],
'scale_pos_weight': [ratio_of_classes, 1], # try different positive-class weights
'colsample_bytree': [0.7, 1.0],
'subsample': [0.8, 1.0]
}
# Search grid for Random Forest
param_grid_rf = {
'n_estimators': [100, 200],
'max_depth': [10, 20, None],
'class_weight': ['balanced', 'balanced_subsample']
}
# Grid search for XGBoost, optimising F1 (more informative than accuracy on imbalanced data)
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)
# Grid search for Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)
# Soft-voting ensemble of the two tuned models (averages predicted probabilities)
model_Camp5 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp5.fit(X_resampled, y_resampled)
# Evaluate on the untouched (non-resampled) test split
y_pred = model_Camp5.predict(X_test)
print(classification_report(y_test, y_pred))
# F1 and Matthews correlation coefficient for the positive class
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'n_estimators': 200}
precision recall f1-score support
0.0 0.98 0.95 0.97 415
1.0 0.47 0.69 0.56 26
accuracy 0.94 441
macro avg 0.73 0.82 0.76 441
weighted avg 0.95 0.94 0.94 441
F1-score: 0.5625
MCC: 0.5406668729694817
In [338]:
from sklearn.metrics import auc

# ROC curve of the ensemble on the held-out test split
test_scores = model_Camp5.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
roc_auc = auc(fpr, tpr)

# Plot the curve against the random-classifier diagonal
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.axis([0.0, 1.0, 0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
In [339]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc

# Persist the fitted ensemble, then reload it to confirm the artifact round-trips
model_path = "model_Camp5.joblib"
dump(model_Camp5, model_path)
model5 = load(model_path)
model5
Out[339]:
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced',
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, ga...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=6,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=200, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced',
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, ga...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=6,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=200, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')RandomForestClassifier(class_weight='balanced', n_estimators=200,
random_state=42)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.7, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=200, n_jobs=None,
num_parallel_tree=None, random_state=42, ...)In [340]:
# Evaluate the reloaded model on the FULL dataset for the AcceptedCmp5 target.
# NOTE(review): X includes the rows the model was trained on, so these metrics
# are optimistic; the honest estimate is the held-out test report above.
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['AcceptedCmp5']
# Predictions
y_pred = model5.predict(X)
# Headline metrics (zero_division=0 avoids warnings if a class is never predicted)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
# Class balance for context
print(df_camp['AcceptedCmp5'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Renamed from `auc`: that name shadowed sklearn.metrics.auc and forced the
# ROC cells to re-import it.
roc_auc_full = roc_auc_score(y_true, model5.predict_proba(X)[:, 1])
print(f"ROC-AUC: {roc_auc_full}")
0.9872842870118075 0.884393063583815 0.9503105590062112 0.9161676646706587
precision recall f1-score support
0.0 1.00 0.99 0.99 2041
1.0 0.88 0.95 0.92 161
accuracy 0.99 2202
macro avg 0.94 0.97 0.95 2202
weighted avg 0.99 0.99 0.99 2202
AcceptedCmp5
0.0 0.926885
1.0 0.073115
Name: proportion, dtype: float64
[[2021 20]
[ 8 153]]
ROC-AUC: 0.9972824184953789
In [346]:
# Campaign-outcome columns to drop before the Response correlation analysis
coldel = ['AcceptedCmp1', 'AcceptedCmp2','AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmpOverall']
In [348]:
# Drop the other campaign-outcome columns so only candidate predictors remain
target = 'Response'
campain6 = df_camp.drop(coldel, axis=1)

# Kendall correlation of every remaining variable with the target
correlations = (
    campain6.corr(method='kendall')[target]
    .sort_values(ascending=False)
)

# Tidy frame for plotting, without the target's trivial self-correlation
correlation_df = correlations.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
correlation_df = correlation_df[correlation_df['Feature'] != target]

# Horizontal barplot of the correlation coefficients
plt.figure(figsize=(8, 4))
sns.barplot(data=correlation_df, x='Correlation', y='Feature', palette='coolwarm')
plt.title(f'Correlations with {target} (Kendall)', fontsize=16, weight='bold', color='Firebrick')
plt.axvline(0, color='gray', linestyle='--')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('')
plt.yticks(fontsize=6)
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [349]:
# Columns excluded from the Response model: all campaign targets plus a few
# features — presumably dropped after the correlation analysis; TODO confirm rationale
coldel1 = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain','AcceptedCmpOverall',
'Response', 'NumWebVisitsMonth', 'Age']
# Sanity check of the available columns
df_camp.columns
Out[349]:
Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Complain', 'Response', 'Age', 'Customer_Days',
'AcceptedCmpOverall', 'MntTotal', 'MntRegularProds', 'Education_level',
'Adult_household', 'Marital', 'People_household', 'Total_Purchases'],
dtype='object')
In [354]:
# Train/test split for the Response target
X = df_camp.drop(coldel1, axis=1)
y = df_camp['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# SMOTE oversampling was tried first; kept here for reference
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# ADASYN used instead of SMOTE to balance the classes (only the training split is resampled)
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
# Class ratio, used as a scale_pos_weight candidate in the XGBoost grid below.
# NOTE(review): this is total/positives (~2 after resampling), not the usual
# negatives/positives ratio — confirm this is intended.
ratio_of_classes = len(y_resampled) / sum(y_resampled == 1)
In [356]:
# --- Hyper-parameter search + soft-voting ensemble for the Response target ---
# NOTE(review): this cell is copy-pasted once per campaign; a shared helper
# function would remove the duplication.
# Search grid for XGBoost
param_grid_xgb = {
'n_estimators': [100, 200],
'max_depth': [3, 6, 10],
'learning_rate': [0.01, 0.1],
'scale_pos_weight': [ratio_of_classes, 1], # try different positive-class weights
'colsample_bytree': [0.7, 1.0],
'subsample': [0.8, 1.0]
}
# Search grid for Random Forest
param_grid_rf = {
'n_estimators': [100, 200],
'max_depth': [10, 20, None],
'class_weight': ['balanced', 'balanced_subsample']
}
# Grid search for XGBoost, optimising F1 (more informative than accuracy on imbalanced data)
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=42), param_grid_xgb, scoring='f1', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters:", grid_search_xgb.best_params_)
# Grid search for Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, scoring='f1', cv=5)
grid_search_rf.fit(X_resampled, y_resampled)
best_rf = grid_search_rf.best_estimator_
print("Best parameters:", grid_search_rf.best_params_)
# Soft-voting ensemble of the two tuned models (averages predicted probabilities)
model_Camp6 = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='soft')
model_Camp6.fit(X_resampled, y_resampled)
# Evaluate on the untouched (non-resampled) test split
y_pred = model_Camp6.predict(X_test)
print(classification_report(y_test, y_pred))
# F1 and Matthews correlation coefficient for the positive class
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"F1-score: {f1}")
print(f"MCC: {mcc}")
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'scale_pos_weight': 1.9829172141918527, 'subsample': 0.8}
Best parameters: {'class_weight': 'balanced_subsample', 'max_depth': None, 'n_estimators': 200}
precision recall f1-score support
0.0 0.92 0.92 0.92 373
1.0 0.55 0.56 0.55 68
accuracy 0.86 441
macro avg 0.74 0.74 0.74 441
weighted avg 0.86 0.86 0.86 441
F1-score: 0.5547445255474452
MCC: 0.4728881523425176
In [358]:
from sklearn.metrics import auc

# ROC curve of the ensemble on the held-out test split
test_scores = model_Camp6.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
roc_auc = auc(fpr, tpr)

# Plot the curve against the random-classifier diagonal
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, color='firebrick', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.axis([0.0, 1.0, 0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
In [360]:
from joblib import dump, load
from sklearn.metrics import roc_curve, auc

# Persist the fitted ensemble, then reload it to confirm the artifact round-trips
model_path = "model_Camp6.joblib"
dump(model_Camp6, model_path)
model6 = load(model_path)
model6
Out[360]:
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced_subsample',
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_typ...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('rf',
RandomForestClassifier(class_weight='balanced_subsample',
n_estimators=200,
random_state=42)),
('xgb',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=0.7, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_typ...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=0.1, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=10,
max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...))],
voting='soft')RandomForestClassifier(class_weight='balanced_subsample', n_estimators=200,
random_state=42)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.7, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=10, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, random_state=42, ...)In [362]:
# Evaluate the reloaded model on the FULL dataset for the Response target.
# NOTE(review): X includes the rows the model was trained on, so these metrics
# are optimistic; the honest estimate is the held-out test report above.
X = df_camp.drop(coldel1, axis=1)
y_true = df_camp['Response']
# Predictions
y_pred = model6.predict(X)
# Headline metrics (zero_division=0 avoids warnings if a class is never predicted)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
print(accuracy, precision, recall, f1)
print(classification_report(y_true, y_pred))
# Class balance for context
print(df_camp['Response'].value_counts(normalize=True))
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Renamed from `auc`: that name shadowed sklearn.metrics.auc and forced the
# ROC cells to re-import it.
roc_auc_full = roc_auc_score(y_true, model6.predict_proba(X)[:, 1])
print(f"ROC-AUC: {roc_auc_full}")
0.9677565849227975 0.8808139534883721 0.9099099099099099 0.8951255539143279
precision recall f1-score support
0.0 0.98 0.98 0.98 1869
1.0 0.88 0.91 0.90 333
accuracy 0.97 2202
macro avg 0.93 0.94 0.94 2202
weighted avg 0.97 0.97 0.97 2202
Response
0.0 0.848774
1.0 0.151226
Name: proportion, dtype: float64
[[1828 41]
[ 30 303]]
ROC-AUC: 0.9874834706295381
In [ ]:
from joblib import dump

# Persist each fitted campaign model under a clear and explicit filename.
campaign_models = [
    (model_Camp1, "model_Camp1.joblib"),
    (model_Camp2, "model_Camp2.joblib"),
    (model_Camp3, "model_Camp3.joblib"),
    (model_Camp4, "model_Camp4.joblib"),
    (model_Camp5, "model_Camp5.joblib"),
    (model_Camp6, "model_Camp6.joblib"),
]
saved = [dump(model, filename) for model, filename in campaign_models]
saved[-1]